Graham Paasch committed
Commit 6b6e30e · 1 Parent(s): b64e5d2

Add Modal vLLM deployment script for network model

Files changed (1)
  1. network_llm_modal.py +129 -0
network_llm_modal.py ADDED
@@ -0,0 +1,129 @@
"""
Modal application to serve a custom network-engineering language model via
vLLM in OpenAI-compatible mode.

Deploy with:

    modal deploy network_llm_modal.py

Then call it with:

    base_url = "https://<your-endpoint>/v1"
    model = "network-llm"
    api_key = "<any value>"  # no key is enforced here; enable Modal proxy auth to lock the endpoint down

This script:
- Builds an image with vLLM and bitsandbytes.
- Caches Hugging Face downloads to speed cold starts.
- Starts vLLM with a chat template and a reduced context window to fit a
  single H100.
"""

from typing import Any
import subprocess
import tempfile

import modal

# Model and serving configuration
MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM"
ALIAS_NAME = "network-llm"
FAST_BOOT = True  # use eager mode for faster cold starts
N_GPU = 1
VLLM_PORT = 8000
MAX_MODEL_LEN = 8192  # keep KV cache within a single H100
GPU_MEMORY_UTILIZATION = 0.90

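# Back-of-envelope check for MAX_MODEL_LEN (an estimate, assuming stock
# Llama-3.1-8B geometry: 32 layers, 8 KV heads, head dim 128, bf16):
# KV cache costs about 2 * 32 * 8 * 128 * 2 bytes ≈ 128 KiB per token, so a
# full 8192-token sequence needs ≈ 1 GiB on top of ~16 GB of weights, which
# fits an 80 GB H100 at 0.90 utilization with room left for batching.
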
# Simple chat template for vLLM (Llama-style headers)
CHAT_TEMPLATE = """{% for m in messages -%}
{{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }}
{% endfor -%}
{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"""

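# For illustration, each message renders roughly as
#
#   <|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>
#
# and the prompt ends with an open assistant header for the model to
# complete, i.e. the Llama 3.x chat convention this fine-tune inherits.
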
# Caches for model weights and vLLM artifacts
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

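# The first cold start downloads the weights (roughly 16 GB for an 8B model
# in bf16) into the huggingface-cache volume; later containers mount the
# same volume and skip the download.
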
# Build image with vLLM + bitsandbytes
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
        "bitsandbytes>=0.46.1",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)

app = modal.App("network-llm-modal")


@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * 60,
    timeout=10 * 60,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60)
def serve() -> None:
    """Launch a vLLM server inside the container."""
    cmd: list[str] = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--served-model-name",
        ALIAS_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
    ]
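
    # --served-model-name means clients pass the short alias ("network-llm")
    # as the OpenAI `model` field instead of the full Hugging Face repo id.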

    # Startup/throughput tradeoff
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]
    cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
    cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)]

    # Write chat template to a temp file and point vLLM at it
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf:
        tf.write(CHAT_TEMPLATE)
        chat_template_path = tf.name
    cmd += ["--chat-template", chat_template_path]

    print("Launching vLLM with command:", " ".join(cmd))
    # Pass the argv list directly; a shell-joined string risks quoting bugs.
    subprocess.Popen(cmd)
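
    # Note: the Popen handle is deliberately not waited on. @modal.web_server
    # expects serve() to return while the server keeps listening on VLLM_PORT;
    # Modal starts routing requests once the port accepts connections (within
    # startup_timeout).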


@app.local_entrypoint()
async def test() -> None:
    """Local test entrypoint: health check + simple chat completion."""
    import asyncio
    import aiohttp

    url = serve.get_web_url()
    print(f"Test endpoint at {url}")

    async with aiohttp.ClientSession(base_url=url) as session:
        async with session.get("/health") as resp:
            print("Health check status:", resp.status)
            assert resp.status == 200, "Server health check failed"

        messages = [
            {"role": "system", "content": "You are a helpful network engineer."},
            {"role": "user", "content": "What does OSPF do in a network?"},
        ]
        payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False}
        headers = {"Content-Type": "application/json"}

        async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp:
            print("Chat completion status:", resp.status)
            result = await resp.text()
            print("Response:", result)

    await asyncio.sleep(1)
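
# An equivalent shell smoke test (illustrative; substitute the URL that
# `modal deploy` prints for this app):
#
#   curl -s https://<your-endpoint>/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "network-llm",
#          "messages": [{"role": "user", "content": "What does OSPF do?"}]}'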