Graham Paasch committed
Commit 6b6e30e · 1 Parent(s): b64e5d2

Add Modal vLLM deployment script for network model

Files changed (1)
  1. network_llm_modal.py +129 -0
network_llm_modal.py ADDED
@@ -0,0 +1,129 @@
"""
Modal application to serve a custom network-engineering language model via
vLLM in OpenAI-compatible mode.

Deploy with:

    modal deploy network_llm_modal.py

Then call it with:

    base_url = "https://<your-endpoint>/v1"
    model = "network-llm"
    api_key = "<any value>"  # no key is enforced here; enable Modal proxy auth to lock the endpoint down

This script:
- Builds an image with vLLM and bitsandbytes.
- Caches Hugging Face downloads to speed cold starts.
- Starts vLLM with a chat template and a reduced context window to fit a
  single H100.
"""

from typing import Any
import subprocess
import tempfile

import modal

# Model and serving configuration
MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM"
ALIAS_NAME = "network-llm"
FAST_BOOT = True  # use eager mode for faster cold starts
N_GPU = 1
VLLM_PORT = 8000
MAX_MODEL_LEN = 8192  # keep KV cache within a single H100
GPU_MEMORY_UTILIZATION = 0.90

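# Back-of-envelope check for MAX_MODEL_LEN (an estimate, assuming stock
# Llama-3.1-8B geometry: 32 layers, 8 KV heads, head dim 128, bf16):
# KV cache costs about 2 * 32 * 8 * 128 * 2 bytes ≈ 128 KiB per token, so a
# full 8192-token sequence needs ≈ 1 GiB on top of ~16 GB of weights, which
# fits an 80 GB H100 at 0.90 utilization with room left for batching.
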
# Simple chat template for vLLM (Llama-style headers)
CHAT_TEMPLATE = """{% for m in messages -%}
{{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }}
{% endfor -%}
{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"""

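# For illustration, each message renders roughly as
#
#   <|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>
#
# and the prompt ends with an open assistant header for the model to
# complete, i.e. the Llama 3.x chat convention this fine-tune inherits.
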
# Caches for model weights and vLLM artifacts
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

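# The first cold start downloads the weights (roughly 16 GB for an 8B model
# in bf16) into the huggingface-cache volume; later containers mount the
# same volume and skip the download.
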
# Build image with vLLM + bitsandbytes
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
        "bitsandbytes>=0.46.1",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)

app = modal.App("network-llm-modal")


@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * 60,
    timeout=10 * 60,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60)
def serve() -> None:
    """Launch a vLLM server inside the container."""
    cmd: list[str] = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--served-model-name",
        ALIAS_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
    ]
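
    # --served-model-name means clients pass the short alias ("network-llm")
    # as the OpenAI `model` field instead of the full Hugging Face repo id.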

    # Startup/throughput tradeoff
    cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
    cmd += ["--tensor-parallel-size", str(N_GPU)]
    cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
    cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)]

    # Write chat template to a temp file and point vLLM at it
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf:
        tf.write(CHAT_TEMPLATE)
        chat_template_path = tf.name
    cmd += ["--chat-template", chat_template_path]

    print("Launching vLLM with command:", " ".join(cmd))
    # Pass the argv list directly; a shell-joined string risks quoting bugs.
    subprocess.Popen(cmd)
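
    # Note: the Popen handle is deliberately not waited on. @modal.web_server
    # expects serve() to return while the server keeps listening on VLLM_PORT;
    # Modal starts routing requests once the port accepts connections (within
    # startup_timeout).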


@app.local_entrypoint()
async def test() -> None:
    """Local test entrypoint: health check + simple chat completion."""
    import asyncio
    import aiohttp

    url = serve.get_web_url()
    print(f"Test endpoint at {url}")

    async with aiohttp.ClientSession(base_url=url) as session:
        async with session.get("/health") as resp:
            print("Health check status:", resp.status)
            assert resp.status == 200, "Server health check failed"

        messages = [
            {"role": "system", "content": "You are a helpful network engineer."},
            {"role": "user", "content": "What does OSPF do in a network?"},
        ]
        payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False}
        headers = {"Content-Type": "application/json"}

        async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp:
            print("Chat completion status:", resp.status)
            result = await resp.text()
            print("Response:", result)

    await asyncio.sleep(1)
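
# An equivalent shell smoke test (illustrative; substitute the URL that
# `modal deploy` prints for this app):
#
#   curl -s https://<your-endpoint>/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "network-llm",
#          "messages": [{"role": "user", "content": "What does OSPF do?"}]}'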