"""
Modal application to serve a custom network-engineering language model via
vLLM in OpenAI-compatible mode.
Deploy with:
modal deploy network_llm_modal.py
Then call it with:
base_url = "https://<your-endpoint>/v1"
model = "network-llm"
api_key = <Modal API token>
This script:
- Builds an image with vLLM and bitsandbytes.
- Caches Hugging Face downloads to speed cold starts.
- Starts vLLM with a chat template and reduced context window to fit a
single H100.
"""
import subprocess
import tempfile
from typing import Any

import modal

# Model and serving configuration
MODEL_NAME = "Irfanuruchi/Llama-3.1-8B-Computer-Networks-LLM"
ALIAS_NAME = "network-llm"
FAST_BOOT = True # use eager mode for faster cold starts
N_GPU = 1
VLLM_PORT = 8000
MAX_MODEL_LEN = 8192 # keep KV cache within a single H100
GPU_MEMORY_UTILIZATION = 0.90

# Simple chat template for vLLM (Llama-style headers)
CHAT_TEMPLATE = """{% for m in messages -%}
{{ '<|start_header_id|>' + m['role'] + '<|end_header_id|>\n\n' + m['content'] + '<|eot_id|>' }}
{% endfor -%}
{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"""
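
# For illustration only: with this template, a [system, user] conversation
# renders to approximately
#
#   <|start_header_id|>system<|end_header_id|>
#
#   You are a helpful network engineer.<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>
#
#   What does OSPF do in a network?<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>
#
# i.e. Llama-3-style per-message headers with <|eot_id|> terminators, plus a
# trailing assistant header to cue generation.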

# Caches for model weights and vLLM artifacts
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

# Build image with vLLM + bitsandbytes
vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .entrypoint([])
    .uv_pip_install(
        "vllm==0.11.2",
        "huggingface-hub==0.36.0",
        "flashinfer-python==0.5.2",
        "bitsandbytes>=0.46.1",
    )
    .env({"HF_XET_HIGH_PERFORMANCE": "1"})
)

app = modal.App("network-llm-modal")


@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * 60,
    timeout=10 * 60,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(max_inputs=32)
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * 60)
def serve() -> None:
"""Launch a vLLM server inside the container."""
cmd: list[str] = [
"vllm",
"serve",
MODEL_NAME,
"--served-model-name",
ALIAS_NAME,
"--host",
"0.0.0.0",
"--port",
str(VLLM_PORT),
]
# Startup/throughput tradeoff
cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
cmd += ["--tensor-parallel-size", str(N_GPU)]
cmd += ["--max-model-len", str(MAX_MODEL_LEN)]
cmd += ["--gpu-memory-utilization", str(GPU_MEMORY_UTILIZATION)]
# Write chat template to a temp file and point vLLM at it
with tempfile.NamedTemporaryFile(mode="w", delete=False) as tf:
tf.write(CHAT_TEMPLATE)
chat_template_path = tf.name
cmd += ["--chat-template", chat_template_path]
print("Launching vLLM with command:", " ".join(cmd))
    subprocess.Popen(" ".join(cmd), shell=True)


@app.local_entrypoint()
async def test() -> None:
"""Local test entrypoint: health check + simple chat completion."""
import asyncio
import aiohttp
url = serve.get_web_url()
print(f"Test endpoint at {url}")
async with aiohttp.ClientSession(base_url=url) as session:
async with session.get("/health") as resp:
print("Health check status:", resp.status)
assert resp.status == 200, "Server health check failed"
messages = [
{"role": "system", "content": "You are a helpful network engineer."},
{"role": "user", "content": "What does OSPF do in a network?"},
]
payload: dict[str, Any] = {"messages": messages, "model": ALIAS_NAME, "stream": False}
headers = {"Content-Type": "application/json"}
async with session.post("/v1/chat/completions", json=payload, headers=headers) as resp:
print("Chat completion status:", resp.status)
result = await resp.text()
print("Response:", result)
await asyncio.sleep(1)
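
# To try it (assuming this file is saved as network_llm_modal.py, per the
# docstring):
#
#   modal run network_llm_modal.py       # runs the test() entrypoint above
#   modal deploy network_llm_modal.py    # deploys serve() as a web endpoint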