Refactor to match paper's implementation pattern exactly
Key changes to align with examples/quick_start_tempo_pfn.py and examples/utils.py:
1. GPU function now takes container and returns model_output dict (not numpy arrays)
2. Container preparation happens OUTSIDE the GPU-decorated function
3. Post-processing (inverse scaling, CPU conversion) happens AFTER GPU call
4. Model stays loaded in global state between calls
5. Only the actual inference (model(container)) is GPU-decorated
This matches the exact pattern from the working reference implementation:
- examples/quick_start_tempo_pfn.py lines 66-75 (container prep outside)
- examples/utils.py lines 56-69 (inference with autocast, then post-process)
The key insight: ZeroGPU works best when the decorated function is minimal
and doesn't handle data preparation or complex post-processing.
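In miniature, the pattern looks like this (a hedged sketch, not the app's code: `spaces.GPU` is the real Hugging Face ZeroGPU decorator, while `torch.nn.Identity` stands in for the TempoPFN model and `forecast` for the Gradio handler):

```python
import spaces  # Hugging Face ZeroGPU helper package
import torch

model = None  # global state: stays loaded between calls on the same worker


@spaces.GPU  # only the actual inference is GPU-decorated
def run_gpu_inference(batch: torch.Tensor) -> torch.Tensor:
    global model
    if model is None:  # load once on the first call, reuse afterwards
        model = torch.nn.Identity().to("cuda")  # stand-in for the real load_model()
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        return model(batch.to("cuda"))


def forecast(raw_values):
    # Data preparation happens OUTSIDE the GPU-decorated function ...
    batch = torch.as_tensor(raw_values, dtype=torch.float32).unsqueeze(0)
    output = run_gpu_inference(batch)
    # ... and post-processing (fp32 cast, CPU copy) happens AFTER the GPU call.
    return output.to(torch.float32).cpu().numpy()
```

On a ZeroGPU Space the GPU is attached only for the duration of the decorated call, which is why everything else has to live outside it.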
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <[email protected]>
@@ -459,12 +459,14 @@ def create_gradio_app():
         return metrics
 
     @spaces.GPU
-    def run_gpu_inference(history_values_tensor, future_values_tensor, start, freq_object):
+    def run_gpu_inference(container):
         """
-        GPU-only inference function
-
+        GPU-only inference function matching the paper's implementation.
+        Takes a container and returns model output dict.
         """
         global model, device
+
+        # Load model once on first call
         if model is None:
             print("--- Loading TempoPFN model for the first time ---")
             device = torch.device("cuda:0")
@@ -474,34 +476,11 @@ def create_gradio_app():
             model = load_model(config_path="configs/example.yaml", model_path=model_path, device=device)
             print("--- Model loaded successfully ---")
 
-        #
-        container = BatchTimeSeriesContainer(
-            history_values=history_values_tensor.to(device),
-            future_values=future_values_tensor.to(device),
-            start=[start],
-            frequency=[freq_object],
-        )
-
-        # Run inference with autocast
+        # Run inference with bfloat16 autocast (exactly like the paper's examples/utils.py)
         with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
             model_output = model(container)
 
-
-        preds_full = model_output["result"].to(torch.float32)
-        if hasattr(model, "scaler") and "scale_statistics" in model_output:
-            preds_full = model.scaler.inverse_scale(preds_full, model_output["scale_statistics"])
-
-        # Convert to numpy arrays before returning (detach from GPU)
-        preds_np = preds_full.detach().cpu().numpy()
-        history_np = history_values_tensor.cpu().numpy()
-        future_np = future_values_tensor.cpu().numpy()
-
-        # Get model quantiles if available
-        model_quantiles = None
-        if hasattr(model, "loss_type") and model.loss_type == "quantile":
-            model_quantiles = model.quantiles
-
-        return history_np, future_np, preds_np, model_quantiles
+        return model_output
 
     def forecast_time_series(data_source, stock_ticker, uploaded_file, forecast_horizon, history_length, seed, synth_generator="Sine Waves", synth_complexity=5):
         """
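For context on the autocast line this hunk keeps: it is stock PyTorch mixed-precision inference, nothing project-specific. A generic sketch (the `Linear` module here is just a demo, not repo code):

```python
import torch

net = torch.nn.Linear(8, 3).to("cuda")
x = torch.randn(2, 8, device="cuda")

# no_grad: pure inference, no autograd bookkeeping.
# autocast(bfloat16): matmul-heavy ops run in bf16 on the GPU.
with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
    y = net(x)

print(y.dtype)  # torch.bfloat16 -> hence the later .to(torch.float32) cast
```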
@@ -679,16 +658,37 @@
         if not isinstance(start, np.datetime64):
             start = np.datetime64(start)
 
-        #
-        history_np, future_np, preds_np, model_quantiles = run_gpu_inference(
-            history_values_tensor, future_values_tensor, start, freq_object
+        # Prepare container (exactly like the paper's example)
+        global device
+        if device is None:
+            device = torch.device("cuda:0")
+
+        container = BatchTimeSeriesContainer(
+            history_values=history_values_tensor.to(device),
+            future_values=future_values_tensor.to(device),
+            start=[start],
+            frequency=[freq_object],
         )
 
-        #
-        history_np = history_np.squeeze(0)
-        future_np = future_np.squeeze(0)
+        # Run GPU inference (returns model_output dict)
+        model_output = run_gpu_inference(container)
+
+        # Post-process predictions (exactly like examples/utils.py lines 65-69)
+        preds_full = model_output["result"].to(torch.float32)
+        if model is not None and hasattr(model, "scaler") and "scale_statistics" in model_output:
+            preds_full = model.scaler.inverse_scale(preds_full, model_output["scale_statistics"])
+
+        # Convert to numpy for plotting
+        preds_np = preds_full.detach().cpu().numpy()
+        history_np = history_values_tensor.cpu().numpy().squeeze(0)
+        future_np = future_values_tensor.cpu().numpy().squeeze(0)
         preds_squeezed = preds_np.squeeze(0)
 
+        # Get model quantiles if available
+        model_quantiles = None
+        if model is not None and hasattr(model, "loss_type") and model.loss_type == "quantile":
+            model_quantiles = model.quantiles
+
         try:
             forecast_plot = plot_multivariate_timeseries(
                 history_values=history_np,
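On the post-processing side, `model.scaler.inverse_scale` undoes whatever normalization was applied before the forward pass. The repo's scaler API is used as-is above; purely as an illustration of the idea, here is a stand-in with hypothetical `loc`/`scale` statistics (not the project's real implementation):

```python
import torch

def inverse_scale(preds: torch.Tensor, stats: dict) -> torch.Tensor:
    # Undo per-series standardization: x = x_scaled * scale + loc
    return preds * stats["scale"] + stats["loc"]

stats = {"loc": torch.tensor(10.0), "scale": torch.tensor(2.5)}
scaled = torch.tensor([0.0, 1.0, -1.0])
print(inverse_scale(scaled, stats))  # tensor([10.0000, 12.5000,  7.5000])
```

Doing this after `run_gpu_inference` returns keeps the decorated function free of anything that does not need the GPU.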