| """ | |
| Modal application to serve a custom network-engineering language model via | |
| vLLM in OpenAI-compatible mode. | |
| Deploy with: | |
| modal deploy network_llm_modal.py | |
| Then call it with: | |
| base_url = "https://<your-endpoint>/v1" | |
| model = "network-llm" | |
| api_key = <Modal API token> | |
| This script: | |
| - Builds an image with vLLM and bitsandbytes. | |
| - Caches Hugging Face downloads to speed cold starts. | |
| - Starts vLLM with a chat template and reduced context window to fit a | |
| single H100. | |
| """ | |
import subprocess
import tempfile
from typing import Any

import modal
# Model and serving configuration
MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM"
ALIAS_NAME = "network-llm"
FAST_BOOT = True  # use eager mode for faster cold starts
N_GPU = 1
VLLM_PORT = 8000
MAX_MODEL_LEN = 8192  # keep KV cache within a single H100
GPU_MEMORY_UTILIZATION = 0.90
# Simple chat template for vLLM (Llama-style headers)
CHAT_TEMPLATE = """{% for m in messages -%}
{{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }}
{% endfor -%}
{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"""
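# For illustration, a system + user exchange renders (roughly) to the prompt
# below, after which the model generates the assistant turn:
#
#   <|start_header_id|>system<|end_header_id|>
#
#   You are a helpful network engineer.<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>
#
#   What does OSPF do in a network?<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>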
# Caches for model weights and vLLM artifacts
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
# Build image with vLLM + bitsandbytes
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
        "bitsandbytes>=0.46.1",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)
| app = modal.App("network-llm-modal") | |
# GPU web endpoint: the decorators attach the image, a single H100, and the
# cache volumes (mounted at the default Hugging Face and vLLM cache paths),
# and expose the vLLM port so `modal deploy` and `serve.get_web_url()` work.
@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60)
def serve() -> None:
    """Launch a vLLM server inside the container."""
    cmd: list[str] = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--served-model-name",
        ALIAS_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
    ]

    # Startup/throughput tradeoff
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]
    cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
    cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)]

    # Write chat template to a temp file and point vLLM at it
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf:
        tf.write(CHAT_TEMPLATE)
        chat_template_path = tf.name
    cmd += ["--chat-template", chat_template_path]

    print("Launching vLLM with command:", " ".join(cmd))
    # Start the server in the background; a @modal.web_server function must
    # return while the process keeps listening on VLLM_PORT.
    subprocess.Popen(cmd)
# Run locally with `modal run network_llm_modal.py` to exercise the endpoint.
@app.local_entrypoint()
async def test() -> None:
    """Local test entrypoint: health check + simple chat completion."""
    import asyncio

    import aiohttp

    url = serve.get_web_url()
    print(f"Test endpoint at {url}")

    async with aiohttp.ClientSession(base_url=url) as session:
        # Health check against vLLM's /health route
        async with session.get("/health") as resp:
            print("Health check status:", resp.status)
            assert resp.status == 200, "Server health check failed"

        # Simple chat completion against the OpenAI-compatible route
        messages = [
            {"role": "system", "content": "You are a helpful network engineer."},
            {"role": "user", "content": "What does OSPF do in a network?"},
        ]
        payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False}
        headers = {"Content-Type": "application/json"}

        async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp:
            print("Chat completion status:", resp.status)
            result = await resp.text()
            print("Response:", result)

    await asyncio.sleep(1)