GrandMasterPomidor committed on
Commit 54a6bd2 · verified · 1 Parent(s): e37062a

Update inference endpoint handler (20250927-165107)

Files changed (3):
  1. README.md +25 -0
  2. handler.py +418 -0
  3. requirements.txt +10 -0
README.md ADDED
@@ -0,0 +1,25 @@
+ # Qwen Omni Hugging Face Inference Endpoint Handler
+
+ This directory contains a reusable custom handler for deploying Qwen3 Omni models
+ (via the Hugging Face Inference Endpoints service). The handler mirrors the
+ multi-modal interaction blueprint from the official Qwen audio/visual dialogue
+ cookbook and supports text, image, and audio turns in a single payload.
+
+ ## Files
+
+ * `handler.py` – entry point loaded by the Inference Endpoint runtime.
+ * `requirements.txt` – Python dependencies installed before the handler is imported.
+
+ ## Usage
+
+ 1. Upload the contents of this directory (`handler.py`, `requirements.txt`) to a
+    Hugging Face model repository that you control (defaults to
+    `GrandMasterPomidor/qwen-omni-endpoint-handler` via the provided Makefile).
+ 2. Provision a custom Inference Endpoint that references that repository and the
+    Qwen Omni model weights you wish to serve. Set environment variables such as
+    `MODEL_ID` to point at your chosen checkpoint (e.g. `Qwen/Qwen2.5-Omni-Mini`).
+ 3. Send JSON payloads to the endpoint as documented in the header docstring of
+    `handler.py` (see the example request below).
+
+ Refer to the accompanying `Makefile` for convenience targets to package and
+ push these assets.
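
As an illustration of step 3, the sketch below sends one multi-modal turn to a deployed endpoint with `requests`. The endpoint URL, token variable, and image URL are placeholders; the payload shape follows the schema documented in `handler.py`, whose return value includes `generated_text`.

```python
# Minimal request sketch for a deployed endpoint.
# ENDPOINT_URL, HF_TOKEN, and the image URL are placeholders: substitute the
# URL shown in the Inference Endpoints UI and a token that can access it.
import os

import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HF_TOKEN = os.environ["HF_TOKEN"]

payload = {
    "inputs": {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe the picture"},
                    {"type": "image", "image_url": "https://example.com/photo.jpg"},
                ],
            }
        ]
    },
    "parameters": {"max_new_tokens": 128, "temperature": 0.7},
}

response = requests.post(
    ENDPOINT_URL,
    headers={
        "Authorization": f"Bearer {HF_TOKEN}",
        "Content-Type": "application/json",
    },
    json=payload,
    timeout=120,
)
response.raise_for_status()
print(response.json()["generated_text"])
```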
handler.py ADDED
@@ -0,0 +1,418 @@
+ """Custom Hugging Face Inference Endpoints handler for Qwen Omni models.
+
+ This handler is designed for multi-modal dialogue with the Qwen3 Omni models,
+ following the audio/visual dialogue cookbook in the Qwen repository. It loads
+ an Omni chat model, accepts mixed text, image, and audio content, and returns
+ an assistant reply that can be fed into subsequent turns.
+
+ Expected request payload structure (JSON):
+     {
+         "inputs": {
+             "messages": [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": "Describe the picture"},
+                         {"type": "image", "image_url": "https://.../photo.jpg"},
+                         {"type": "audio", "audio_url": "https://.../clip.wav"}
+                     ]
+                 }
+             ]
+         },
+         "parameters": {
+             "max_new_tokens": 256,
+             "temperature": 0.7,
+             "top_p": 0.9
+         }
+     }
+
+ Supported content variants:
+ * Text: provide "text" or "value".
+ * Image: provide one of "image" (base64 string with optional data URI),
+   "image_url" (HTTP(S) URL), or "image_path" (path within the repository).
+ * Audio: provide either
+     - "audio"/"array" with float samples plus "sampling_rate" (Hz), or
+     - base64 data under "audio"/"audio_b64", or
+     - remote/local path via "audio_url"/"audio_path".
+
+ Environment variables:
+ * MODEL_ID (defaults to Qwen/Qwen3-Omni-30B-A3B-Instruct) – Hugging Face model repo.
+ * DEVICE (defaults to cuda if available else cpu) – Inference device override.
+ * DEVICE_MAP (defaults to auto when GPU available) – Passed to from_pretrained.
+ * TORCH_DTYPE (defaults to bfloat16 on GPU, float32 on CPU) – torch dtype name.
+ * MAX_NEW_TOKENS, TEMPERATURE, TOP_P, TOP_K, DO_SAMPLE – override defaults.
+
+ Returned payload:
+     {
+         "generated_text": "...assistant reply...",
+         "messages": [...messages augmented with assistant turn...],
+         "generation_kwargs": {...actual generation settings used...}
+     }
+ """
+
+ from __future__ import annotations
+
+ import base64
+ import io
+ import json
+ import os
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+ import numpy as np
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+
+ try:
+     import requests
+ except ImportError:  # pragma: no cover - requests is available on endpoints but guard just in case
+     requests = None  # type: ignore
+
+
+ @dataclass
+ class AudioPayload:
+     """Container for audio samples consumed by the Omni processor."""
+
+     array: np.ndarray
+     sampling_rate: int
+
+     def as_processor_input(self) -> Dict[str, Any]:
+         return {
+             "array": self.array.astype(np.float32),
+             "sampling_rate": int(self.sampling_rate),
+         }
+
+
+ class EndpointHandler:
+     """Hugging Face custom handler compatible with multi-modal Qwen Omni models."""
+
+     def __init__(self, path: str = "") -> None:
+         model_id = os.getenv(
+             "MODEL_ID") or path or "Qwen/Qwen3-Omni-30B-A3B-Instruct"
+         device_hint = os.getenv("DEVICE")
+         self.device = device_hint or (
+             "cuda" if torch.cuda.is_available() else "cpu")
+         dtype_name = os.getenv(
+             "TORCH_DTYPE",
+             "bfloat16" if self.device.startswith("cuda") else "float32",
+         )
+         torch_dtype = getattr(torch, dtype_name, None)
+         if torch_dtype is None:
+             raise ValueError(f"Unsupported TORCH_DTYPE value: {dtype_name}")
+
+         model_kwargs: Dict[str, Any] = {
+             "trust_remote_code": True,
+             "torch_dtype": torch_dtype,
+         }
+
+         device_map_env = os.getenv("DEVICE_MAP")
+         if device_map_env:
+             model_kwargs["device_map"] = device_map_env
+         elif self.device != "cpu":
+             model_kwargs["device_map"] = "auto"
+
+         self.model = AutoModelForCausalLM.from_pretrained(
+             model_id, **model_kwargs)
+         if model_kwargs.get("device_map") is None:
+             self.model.to(self.device)
+
+         self.processor = AutoProcessor.from_pretrained(
+             model_id, trust_remote_code=True)
+
+         try:
+             generation_config = GenerationConfig.from_pretrained(model_id)
+         except Exception:  # pragma: no cover - not all repos ship a config
+             generation_config = self.model.generation_config
+         self.base_generation_kwargs = self._extract_generation_kwargs(
+             generation_config)
+
+     # ---------------------------------------------------------------------
+     # Public API
+     # ---------------------------------------------------------------------
+     def __call__(self, data: Dict[str, Any], *args: Any, **kwargs: Any) -> Dict[str, Any]:
+         if not data:
+             raise ValueError("Empty payload received by handler")
+
+         payload = data.get("inputs") if isinstance(data, dict) else data
+         parameters = data.get("parameters", {}) if isinstance(
+             data, dict) else {}
+
+         messages = self._normalize_messages(payload)
+         processed_messages, images, audios = self._prepare_messages(messages)
+
+         chat_template = self.processor.apply_chat_template(
+             processed_messages,
+             add_generation_prompt=True,
+             tokenize=False,
+         )
+
+         model_inputs = self.processor(
+             text=chat_template,
+             images=[img for img in images] if images else None,
+             audios=[aud.as_processor_input()
+                     for aud in audios] if audios else None,
+             return_tensors="pt",
+         )
+
+         if hasattr(model_inputs, "to"):
+             model_inputs = model_inputs.to(self.model.device if hasattr(
+                 self.model, "device") else self.device)
+         else:
+             model_inputs = {
+                 k: v.to(self.model.device if hasattr(
+                     self.model, "device") else self.device)
+                 for k, v in model_inputs.items()
+             }
+
+         generation_kwargs = {**self.base_generation_kwargs, **parameters}
+         generation_kwargs.setdefault("return_dict_in_generate", True)
+         generation_kwargs.setdefault("output_scores", False)
+
+         with torch.inference_mode():
+             outputs = self.model.generate(**model_inputs, **generation_kwargs)
+
+         sequences = outputs.sequences if hasattr(
+             outputs, "sequences") else outputs
+         input_length = model_inputs["input_ids"].shape[-1]
+         generated_ids = sequences[:, input_length:]
+         generated_text = self.processor.batch_decode(
+             generated_ids,
+             skip_special_tokens=True,
+             clean_up_tokenization_spaces=True,
+         )[0].strip()
+
+         augmented_messages = list(messages) + [
+             {
+                 "role": "assistant",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": generated_text,
+                     }
+                 ],
+             }
+         ]
+
+         return {
+             "generated_text": generated_text,
+             "messages": augmented_messages,
+             "generation_kwargs": generation_kwargs,
+         }
+
+     # ------------------------------------------------------------------
+     # Helpers
+     # ------------------------------------------------------------------
+     @staticmethod
+     def _extract_generation_kwargs(config: GenerationConfig) -> Dict[str, Any]:
+         defaults = {
+             "max_new_tokens": getattr(config, "max_new_tokens", None) or 512,  # GenerationConfig stores None by default
+             "temperature": getattr(config, "temperature", 0.7),
+             "top_p": getattr(config, "top_p", 0.9),
+             "top_k": getattr(config, "top_k", None),
+             "do_sample": getattr(config, "do_sample", True),
+         }
+
+         env_overrides = {
+             "max_new_tokens": os.getenv("MAX_NEW_TOKENS"),
+             "temperature": os.getenv("TEMPERATURE"),
+             "top_p": os.getenv("TOP_P"),
+             "top_k": os.getenv("TOP_K"),
+             "do_sample": os.getenv("DO_SAMPLE"),
+         }
+
+         for key, value in env_overrides.items():
+             if value is None:
+                 continue
+             if key == "do_sample":
+                 defaults[key] = value.lower() == "true"
+             elif key == "max_new_tokens" or key == "top_k":
+                 defaults[key] = int(value)
+             else:
+                 defaults[key] = float(value)
+         return {k: v for k, v in defaults.items() if v is not None}
+
+     @staticmethod
+     def _normalize_messages(payload: Any) -> List[Dict[str, Any]]:
+         if isinstance(payload, str):
+             return [
+                 {
+                     "role": "user",
+                     "content": [{"type": "text", "text": payload}],
+                 }
+             ]
+         if isinstance(payload, dict) and "messages" in payload:
+             return payload["messages"]
+         if isinstance(payload, dict):
+             text_value = payload.get("prompt") or payload.get("text")
+             if text_value:
+                 return [
+                     {
+                         "role": payload.get("role", "user"),
+                         "content": [{"type": "text", "text": text_value}],
+                     }
+                 ]
+         raise ValueError(
+             "Unsupported input format. Provide `inputs.messages` or a raw text prompt.")
+
+     def _prepare_messages(
+         self, messages: Iterable[Dict[str, Any]]
+     ) -> Tuple[List[Dict[str, Any]], List[Image.Image], List[AudioPayload]]:
+         processed_messages: List[Dict[str, Any]] = []
+         images: List[Image.Image] = []
+         audios: List[AudioPayload] = []
+
+         for message in messages:
+             role = message.get("role", "user")
+             raw_content = message.get("content")
+             if raw_content is None:
+                 raise ValueError(f"Message without content: {message}")
+
+             if isinstance(raw_content, str):
+                 raw_content = [{"type": "text", "text": raw_content}]
+
+             new_parts: List[Dict[str, Any]] = []
+             for part in raw_content:
+                 part_type = part.get("type", "text")
+
+                 if part_type == "text":
+                     text = part.get("text") or part.get("value")
+                     if text is None:
+                         raise ValueError(f"Missing text value in part: {part}")
+                     new_parts.append({"type": "text", "text": text})
+
+                 elif part_type == "image":
+                     image = self._load_image(part)
+                     images.append(image)
+                     new_parts.append({"type": "image", "image": image})
+
+                 elif part_type == "audio":
+                     audio_payload = self._load_audio(part)
+                     audios.append(audio_payload)
+                     new_parts.append(
+                         {"type": "audio", "audio": audio_payload.as_processor_input()})
+
+                 else:
+                     raise ValueError(f"Unsupported content type: {part_type}")
+
+             processed_messages.append({"role": role, "content": new_parts})
+
+         return processed_messages, images, audios
+
+     # ------------------------------------------------------------------
+     # Loaders
+     # ------------------------------------------------------------------
+     def _load_image(self, part: Dict[str, Any]) -> Image.Image:
+         if "image" in part and isinstance(part["image"], Image.Image):
+             return part["image"]
+         if "image" in part and isinstance(part["image"], str):
+             return self._decode_image_string(part["image"])
+         if "image_b64" in part:
+             return self._decode_image_string(part["image_b64"])
+         if "image_path" in part:
+             return Image.open(part["image_path"]).convert("RGB")
+         if "image_url" in part:
+             data = self._fetch_remote(part["image_url"])
+             return Image.open(io.BytesIO(data)).convert("RGB")
+         raise ValueError(f"Cannot resolve image content from part: {part}")
+
+     def _load_audio(self, part: Dict[str, Any]) -> AudioPayload:
+         if "audio" in part and isinstance(part["audio"], dict) and "array" in part["audio"]:
+             array = np.asarray(part["audio"]["array"], dtype=np.float32)
+             sampling_rate = int(part["audio"].get(
+                 "sampling_rate", part.get("sampling_rate", 16000)))
+             return AudioPayload(array=array, sampling_rate=sampling_rate)
+
+         if "array" in part:
+             array = np.asarray(part["array"], dtype=np.float32)
+             sampling_rate = int(part.get("sampling_rate", 16000))
+             return AudioPayload(array=array, sampling_rate=sampling_rate)
+
+         audio_bytes: Optional[bytes] = None
+         if "audio" in part and isinstance(part["audio"], str):
+             audio_bytes = self._maybe_read_bytes(part["audio"])
+         elif "audio_b64" in part:
+             audio_bytes = base64.b64decode(part["audio_b64"])
+         elif "audio_path" in part:
+             with open(part["audio_path"], "rb") as handle:
+                 audio_bytes = handle.read()
+         elif "audio_url" in part:
+             audio_bytes = self._fetch_remote(part["audio_url"])
+
+         if audio_bytes is None:
+             raise ValueError(f"Cannot resolve audio content from part: {part}")
+
+         array, sampling_rate = self._decode_audio(audio_bytes)
+         return AudioPayload(array=array, sampling_rate=sampling_rate)
+
+     @staticmethod
+     def _decode_image_string(raw: str) -> Image.Image:
+         if raw.startswith("data:"):
+             raw = raw.split(",", 1)[1]
+         image_bytes = base64.b64decode(raw)
+         return Image.open(io.BytesIO(image_bytes)).convert("RGB")
+
+     @staticmethod
+     def _maybe_read_bytes(value: str) -> bytes:
+         if os.path.exists(value):
+             with open(value, "rb") as handle:
+                 return handle.read()
+         try:
+             if value.startswith("data:"):
+                 value = value.split(",", 1)[1]
+             return base64.b64decode(value)
+         except Exception as exc:
+             raise ValueError(
+                 "Provide either a file path or base64-encoded audio for 'audio'.") from exc
+
+     @staticmethod
+     def _decode_audio(raw_bytes: bytes) -> Tuple[np.ndarray, int]:
+         # Try python-soundfile first, fall back to torchaudio if available.
+         try:
+             import soundfile as sf
+
+             array, sampling_rate = sf.read(io.BytesIO(raw_bytes))
+             if array.ndim > 1:
+                 array = np.mean(array, axis=1)
+             return array.astype(np.float32), int(sampling_rate)
+         except Exception:
+             pass
+
+         try:
+             import torchaudio
+
+             waveform, sampling_rate = torchaudio.load(io.BytesIO(raw_bytes))
+             array = waveform.mean(dim=0).numpy()
+             return array.astype(np.float32), int(sampling_rate)
+         except Exception as exc:
+             raise RuntimeError(
+                 "Unable to decode audio bytes. Install 'soundfile' or 'torchaudio' in requirements."
+             ) from exc
+
+     @staticmethod
+     def _fetch_remote(url: str) -> bytes:
+         if requests is None:
+             raise RuntimeError(
+                 "requests is required to download remote resources")
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         return response.content
+
+
+ if __name__ == "__main__":  # pragma: no cover - simple smoke test entry point
+     handler = EndpointHandler()
+     demo_payload = {
+         "inputs": {
+             "messages": [
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "text", "text": "Describe the image"},
+                     ],
+                 }
+             ]
+         },
+         "parameters": {"max_new_tokens": 64},
+     }
+     response = handler(demo_payload)
+     print(json.dumps(response, indent=2))
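
The handler docstring lists several ways to inline media instead of passing URLs. The sketch below base64-encodes local files into those content variants; the file names are placeholders, and the resulting dictionary can be POSTed to the endpoint or handed directly to a locally constructed `EndpointHandler` as a smoke test.

```python
# Sketch: inline image/audio content parts as base64 strings.
# "sample.jpg" and "sample.wav" are placeholder local files; the field names
# ("image", "audio") match the variants documented in the handler docstring.
import base64
import json


def encode_file(path: str) -> str:
    """Return the file contents as a base64 string."""
    with open(path, "rb") as fh:
        return base64.b64encode(fh.read()).decode("utf-8")


payload = {
    "inputs": {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What do you see and hear?"},
                    {"type": "image", "image": encode_file("sample.jpg")},
                    {"type": "audio", "audio": encode_file("sample.wav")},
                ],
            }
        ]
    },
    "parameters": {"max_new_tokens": 256},
}

# The same dictionary works as the JSON body of an endpoint request or as the
# argument to EndpointHandler(...) when testing locally.
print(json.dumps(payload)[:120], "...")
```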
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ # Dependencies for the Qwen Omni custom inference handler
+ transformers>=4.43.0
+ accelerate>=0.33.0
+ torch>=2.2.0
+ sentencepiece
+ numpy>=1.24
+ pillow>=10.0
+ requests>=2.31
+ soundfile>=0.12
+ torchaudio>=2.2
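
The environment variables described in the handler docstring can also be exercised locally before deploying. A minimal sketch, assuming the dependencies above are installed and the script runs from this repository so that `handler.py` is importable; when deploying, the same variables would typically be set in the endpoint configuration instead.

```python
# Sketch: override generation defaults via the environment variables the
# handler reads (MODEL_ID, MAX_NEW_TOKENS, TEMPERATURE, DO_SAMPLE, ...).
# The values shown are illustrative; set them before constructing the handler.
import os

os.environ["MODEL_ID"] = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
os.environ["MAX_NEW_TOKENS"] = "128"
os.environ["TEMPERATURE"] = "0.6"
os.environ["DO_SAMPLE"] = "true"

from handler import EndpointHandler  # noqa: E402 - import after env setup

handler = EndpointHandler()  # downloads the model weights on first use
print(handler.base_generation_kwargs)
```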