"""
Modal application to serve a custom network-engineering language model via
vLLM in OpenAI-compatible mode.
Deploy with:
modal deploy network_llm_modal.py
Then call it with:
base_url = "https://<your-endpoint>/v1"
model = "network-llm"
api_key = <Modal API token>
This script:
- Builds an image with vLLM and bitsandbytes.
- Caches Hugging Face downloads to speed cold starts.
- Starts vLLM with a chat template and reduced context window to fit a
single H100.
"""
import subprocess
import tempfile
from typing import Any

import modal

# Model and serving configuration
MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM"
ALIAS_NAME = "network-llm"
FAST_BOOT = True # use eager mode for faster cold starts
N_GPU = 1
VLLM_PORT = 8000
MAX_MODEL_LEN = 8192 # keep KV cache within a single H100
GPU_MEMORY_UTILIZATION = 0.90

# Simple chat template for vLLM (Llama-style headers)
CHAT_TEMPLATE = """{% for m in messages -%}
{{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }}
{% endfor -%}
{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"""
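
# For illustration only: with this template, a [system, user] conversation
# renders to approximately
#
#   <|start_header_id|>system<|end_header_id|>
#
#   You are a helpful network engineer.<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>
#
#   What does OSPF do in a network?<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>
#
# i.e. Llama-3-style per-message headers with <|eot_id|> terminators, plus a
# trailing assistant header to cue generation.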

# Caches for model weights and vLLM artifacts
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

# Build image with vLLM + bitsandbytes
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
        "bitsandbytes>=0.46.1",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)

app = modal.App("network-llm-modal")


@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * 60,
    timeout=10 * 60,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60)
def serve() -> None:
"""Launch a vLLM server inside the container."""
cmd: list[str] = [
"vllm",
"serve",
MODEL_NAME,
"--served-model-name",
ALIAS_NAME,
"--host",
"0.0.0.0",
"--port",
str(VLLM_PORT),
]
# Startup/throughput tradeoff
cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
cmd += ["--tensor-parallel-size", str(N_GPU)]
cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)]
# Write chat template to a temp file and point vLLM at it
with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf:
tf.write(CHAT_TEMPLATE)
chat_template_path = tf.name
cmd += ["--chat-template", chat_template_path]
print("Launching vLLM with command:", " ".join(cmd))
    subprocess.Popen(" ".join(cmd), shell=True)


@app.local_entrypoint()
async def test() -> None:
"""Local test entrypoint: health check + simple chat completion."""
import asyncio
import aiohttp
url = serve.get_web_url()
print(f"Test endpoint at {url}")
async with aiohttp.ClientSession(base_url=url) as session:
async with session.get("/health") as resp:
print("Health check status:", resp.status)
assert resp.status == 200, "Server health check failed"
messages = [
{"role": "system", "content": "You are a helpful network engineer."},
{"role": "user", "content": "What does OSPF do in a network?"},
]
payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False}
headers = {"Content-Type": "application/json"}
async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp:
print("Chat completion status:", resp.status)
result = await resp.text()
print("Response:", result)
await asyncio.sleep(1)
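
# To try it (assuming this file is saved as network_llm_modal.py, per the
# docstring):
#
#   modal run network_llm_modal.py       # runs the test() entrypoint above
#   modal deploy network_llm_modal.py    # deploys serve() as a web endpoint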