""" Modal application to serve a custom network-engineering language model via vLLM in OpenAI-compatible mode. Deploy with: modal deploy network_llm_modal.py Then call it with: base_url = "https:///v1" model = "network-llm" api_key = This script: - Builds an image with vLLM and bitsandbytes. - Caches Hugging Face downloads to speed cold starts. - Starts vLLM with a chat template and reduced context window to fit a single H100. """ from typing import Any import subprocess import tempfile import modal # Model and serving configuration MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM" ALIAS_NAME = "network-llm" FAST_BOOT = True # use eager mode for faster cold starts N_GPU = 1 VLLM_PORT = 8000 MAX_MODEL_LEN = 8192 # keep KV cache within a single H100 GPU_MEMORY_UTILIZATION = 0.90 # Simple chat template for vLLM (Llama-style headers) CHAT_TEMPLATE = """{% for m in messages -%} {{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }} {% endfor -%} {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}""" # Caches for model weights and vLLM artifacts hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True) vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True) # Build image with vLLM + bitsandbytes vllm_image = ( modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12") .entrypoint([]) .uv_pip_install( "vllm==0.11.2", "huggingface-hub==0.36.0", "flashinfer-python==0.5.2", "bitsandbytes>=0.46.1", ) .env({"HF_XET_HIGH_PERFORMANCE": "1"}) ) app = modal.App("network-llm-modal") @app.function( image=vllm_image, gpu=f"H100:{N_GPU}", scaledown_window=15 * 60, timeout=10 * 60, volumes={ "/root/.cache/huggingface": hf_cache_vol, "/root/.cache/vllm": vllm_cache_vol, }, ) @modal.concurrent(max_inputs=32) @modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60) def serve() -> None: """Launch a vLLM server inside the container.""" cmd: list[str] = [ "vllm", "serve", MODEL_NAME, "--served-model-name", ALIAS_NAME, "--host", "0.0.0.0", "--port", str(VLLM_PORT), ] # Startup/throughput tradeoff cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"] cmd += ["--tensor-parallel-size", str(N_GPU)] cmd += ["--max-model-len", str(MAX_MODEL_LEN)] cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)] # Write chat template to a temp file and point vLLM at it with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf: tf.write(CHAT_TEMPLATE) chat_template_path = tf.name cmd += ["--chat-template", chat_template_path] print("Launching vLLM with command:", " ".join(cmd)) subprocess.Popen(" ".join(cmd), shell=True) @app.local_entrypoint() async def test() -> None: """Local test entrypoint: health check + simple chat completion.""" import asyncio import aiohttp url = serve.get_web_url() print(f"Test endpoint at {url}") async with aiohttp.ClientSession(base_url=url) as session: async with session.get("/health") as resp: print("Health check status:", resp.status) assert resp.status == 200, "Server health check failed" messages = [ {"role": "system", "content": "You are a helpful network engineer."}, {"role": "user", "content": "What does OSPF do in a network?"}, ] payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False} headers = {"Content-Type": "application/json"} async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp: print("Chat completion status:", resp.status) result = await resp.text() print("Response:", 
result) await asyncio.sleep(1)
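

# --- Example client call (sketch) ---------------------------------------------
# A minimal sketch of querying the deployed endpoint with the official `openai`
# Python client, kept as a comment so it does not run on `modal deploy`. The
# base_url below is a placeholder, not the real endpoint: substitute the URL
# that `modal deploy` prints for the `serve` web function. The api_key value is
# arbitrary unless the endpoint is placed behind authentication.
#
# from openai import OpenAI
#
# client = OpenAI(
#     base_url="https://<your-workspace>--network-llm-modal-serve.modal.run/v1",  # placeholder
#     api_key="not-needed",  # placeholder
# )
# resp = client.chat.completions.create(
#     model="network-llm",
#     messages=[{"role": "user", "content": "Compare OSPF and IS-IS."}],
# )
# print(resp.choices[0].message.content)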