""" Modal application to serve a custom network-engineering language model via vLLM in OpenAI-compatible mode. Deploy with: modal deploy network_llm_modal.py Then call it with: base_url = "https:///v1" model = "network-llm" api_key = This script: - Builds an image with vLLM and bitsandbytes. - Caches Hugging Face downloads to speed cold starts. - Starts vLLM with a chat template and reduced context window to fit a single H100. """ from typing import Any import subprocess import tempfile import modal # Model and serving configuration MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM" ALIAS_NAME = "network-llm" FAST_BOOT = True # use eager mode for faster cold starts N_GPU = 1 VLLM_PORT = 8000 MAX_MODEL_LEN = 8192 # keep KV cache within a single H100 GPU_MEMORY_UTILIZATION = 0.90 # Simple chat template for vLLM (Llama-style headers) CHAT_TEMPLATE = """{% for m in messages -%} {{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }} {% endfor -%} {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}""" # Caches for model weights and vLLM artifacts hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True) vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True) # Build image with vLLM + bitsandbytes vllm_image = ( modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12") .entrypoint([]) .uv_pip_install( "vllm==0.11.2", "huggingface-hub==0.36.0", "flashinfer-python==0.5.2", "bitsandbytes>=0.46.1", ) .env({"HF_XET_HIGH_PERFORMANCE": "1"}) ) app = modal.App("network-llm-modal") @app.function( image=vllm_image, gpu=f"H100:{N_GPU}", scaledown_window=15 * 60, timeout=10 * 60, volumes={ "/root/.cache/huggingface": hf_cache_vol, "/root/.cache/vllm": vllm_cache_vol, }, ) @modal.concurrent(max_inputs=32) @modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60) def serve() -> None: """Launch a vLLM server inside the container.""" cmd: list[str] = [ "vllm", "serve", MODEL_NAME, "--served-model-name", ALIAS_NAME, "--host", "0.0.0.0", "--port", str(VLLM_PORT), ] # Startup/throughput tradeoff cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"] cmd += ["--tensor-parallel-size", str(N_GPU)] cmd += ["--max-model-len", str(MAX_MODEL_LEN)] cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)] # Write chat template to a temp file and point vLLM at it with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf: tf.write(CHAT_TEMPLATE) chat_template_path = tf.name cmd += ["--chat-template", chat_template_path] print("Launching vLLM with command:", " ".join(cmd)) subprocess.Popen(" ".join(cmd), shell=True) @app.local_entrypoint() async def test() -> None: """Local test entrypoint: health check + simple chat completion.""" import asyncio import aiohttp url = serve.get_web_url() print(f"Test endpoint at {url}") async with aiohttp.ClientSession(base_url=url) as session: async with session.get("/health") as resp: print("Health check status:", resp.status) assert resp.status == 200, "Server health check failed" messages = [ {"role": "system", "content": "You are a helpful network engineer."}, {"role": "user", "content": "What does OSPF do in a network?"}, ] payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False} headers = {"Content-Type": "application/json"} async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp: print("Chat completion status:", resp.status) result = await resp.text() print("Response:", 
result) await asyncio.sleep(1)
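

# --- Example client call (sketch) ---------------------------------------------
# A minimal sketch of querying the deployed endpoint with the official `openai`
# Python client, kept as a comment so it does not run on `modal deploy`. The
# base_url below is a placeholder, not the real endpoint: substitute the URL
# that `modal deploy` prints for the `serve` web function. The api_key value is
# arbitrary unless the endpoint is placed behind authentication.
#
# from openai import OpenAI
#
# client = OpenAI(
#     base_url="https://<your-workspace>--network-llm-modal-serve.modal.run/v1",  # placeholder
#     api_key="not-needed",  # placeholder
# )
# resp = client.chat.completions.create(
#     model="network-llm",
#     messages=[{"role": "user", "content": "Compare OSPF and IS-IS."}],
# )
# print(resp.choices[0].message.content)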