Graham Paasch committed · commit 6b6e30e · 1 parent: b64e5d2
Add Modal vLLM deployment script for network model

Changed files: network_llm_modal.py (+129, -0)
network_llm_modal.py  ADDED
@@ -0,0 +1,129 @@
"""
Modal application to serve a custom network-engineering language model via
vLLM in OpenAI-compatible mode.

Deploy with:
    modal deploy network_llm_modal.py

Then call it with:
    base_url = "https://<your-endpoint>/v1"
    model = "network-llm"
    api_key = <Modal API token>

This script:
- Builds an image with vLLM and bitsandbytes.
- Caches Hugging Face downloads to speed cold starts.
- Starts vLLM with a chat template and reduced context window to fit a
  single H100.
"""

from typing import Any
import subprocess
import tempfile

import modal

# Model and serving configuration
MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM"
ALIAS_NAME = "network-llm"
FAST_BOOT = True  # use eager mode for faster cold starts
N_GPU = 1
VLLM_PORT = 8000
MAX_MODEL_LEN = 8192  # keep KV cache within a single H100
GPU_MEMORY_UTILIZATION = 0.90

# Simple chat template for vLLM (Llama-style headers)
CHAT_TEMPLATE = """{% for m in messages -%}
{{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }}
{% endfor -%}
{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"""
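# Illustrative rendering (an assumption added for clarity): for
# messages = [{"role": "user", "content": "Hi"}], this template produces roughly
#   <|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n
# i.e. each message is wrapped in Llama-3-style headers, with the assistant
# header left open for the model to complete.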

# Caches for model weights and vLLM artifacts
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

# Build image with vLLM + bitsandbytes
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
        "bitsandbytes>=0.46.1",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)

app = modal.App("network-llm-modal")

@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * 60,
    timeout=10 * 60,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60)
def serve() -> None:
    """Launch a vLLM server inside the container."""
    cmd: list[str] = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--served-model-name",
        ALIAS_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
    ]

    # Startup/throughput tradeoff
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]
    cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
    cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)]
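    # Illustrative note (assumes the defaults above with FAST_BOOT = True):
    # the assembled command is roughly
    #   vllm serve Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM \
    #     --served-model-name network-llm --host 0.0.0.0 --port 8000 \
    #     --enforce-eager --tensor-parallel-size 1 --max-model-len 8192 \
    #     --gpu-memory-utilization 0.9 --chat-template <temp file, see below>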

    # Write chat template to a temp file and point vLLM at it
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf:
        tf.write(CHAT_TEMPLATE)
        chat_template_path = tf.name
    cmd += ["--chat-template", chat_template_path]

    print("Launching vLLM with command:", " ".join(cmd))
    subprocess.Popen(" ".join(cmd), shell=True)


@app.local_entrypoint()
async def test() -> None:
    """Local test entrypoint: health check + simple chat completion."""
    import asyncio
    import aiohttp

    url = serve.get_web_url()
    print(f"Test endpoint at {url}")

    async with aiohttp.ClientSession(base_url=url) as session:
        async with session.get("/health") as resp:
            print("Health check status:", resp.status)
            assert resp.status == 200, "Server health check failed"

        messages = [
            {"role": "system", "content": "You are a helpful network engineer."},
            {"role": "user", "content": "What does OSPF do in a network?"},
        ]
        payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False}
        headers = {"Content-Type": "application/json"}

        async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp:
            print("Chat completion status:", resp.status)
            result = await resp.text()
            print("Response:", result)

    await asyncio.sleep(1)
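
For reference, the OpenAI-compatible endpoint described in the module docstring can be exercised with a short client script along these lines (a minimal sketch; the use of the openai Python package and the placeholder endpoint URL are assumptions, not part of this file):

    # client_example.py: hypothetical client for the deployed vLLM server
    from openai import OpenAI

    client = OpenAI(
        base_url="https://<your-endpoint>/v1",  # the URL printed by modal deploy
        api_key="<Modal API token>",
    )
    response = client.chat.completions.create(
        model="network-llm",  # the ALIAS_NAME served via --served-model-name
        messages=[
            {"role": "system", "content": "You are a helpful network engineer."},
            {"role": "user", "content": "What does OSPF do in a network?"},
        ],
    )
    print(response.choices[0].message.content)

The bundled test entrypoint performs the same health check and chat completion itself and can be invoked with: modal run network_llm_modal.py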