Spaces:

dks1
/

tempoPFN

Sleeping

altpuppet Claude committed 29 days ago

Commit

c87b856

1 Parent(s): 426ae1a

Separate GPU inference from post-processing to fix ZeroGPU 404 error

- Created dedicated run_gpu_inference() function with @spaces.GPU decorator
- GPU function returns only simple serializable types (numpy arrays, list)
- Main forecast_time_series() handles data loading and post-processing
- Prevents complex objects (Plotly figures, DataFrames, dicts) from crossing GPU boundary
- This matches the pattern from the original working implementation

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show

app.py +51 -36

app.py CHANGED Viewed

@@ -459,6 +459,50 @@ def create_gradio_app():
         return metrics
     @spaces.GPU
     def forecast_time_series(data_source, stock_ticker, uploaded_file, forecast_horizon, history_length, seed, synth_generator="Sine Waves", synth_complexity=5):
         """
         Runs the TempoPFN forecast.
@@ -631,49 +675,20 @@ def create_gradio_app():
             history_values_tensor = values_tensor[:, :-future_length, :]
             future_values_tensor = values_tensor[:, -future_length:, :]
-            # Ensure start is np.datetime64 and wrap in list
             if not isinstance(start, np.datetime64):
                 start = np.datetime64(start)
-            # Load model if needed (GPU-only app)
-            global model, device
-            if model is None:
-                print("--- Loading TempoPFN model for the first time ---")
-                device = torch.device("cuda:0")
-                print(f"Downloading model...")
-                model_path = hf_hub_download(repo_id="AutoML-org/TempoPFN", filename="models/checkpoint_38M.pth")
-                print(f"Loading model from {model_path} to {device}...")
-                model = load_model(config_path="configs/example.yaml", model_path=model_path, device=device)
-                print("--- Model loaded successfully ---")
-            # Prepare container and run inference
-            container = BatchTimeSeriesContainer(
-                history_values=history_values_tensor.to(device),
-                future_values=future_values_tensor.to(device),
-                start=[start],  # List with single np.datetime64 element
-                frequency=[freq_object],  # List with single Frequency enum element
             )
-            # Run inference with autocast
-            with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
-                model_output = model(container)
-            # Post-process predictions
-            preds_full = model_output["result"].to(torch.float32)
-            if hasattr(model, "scaler") and "scale_statistics" in model_output:
-                preds_full = model.scaler.inverse_scale(preds_full, model_output["scale_statistics"])
-            preds_full_cpu = preds_full.detach().cpu()
-            preds_np = preds_full_cpu.numpy()
-            history_np = history_values_tensor.squeeze(0).numpy()
-            future_np = future_values_tensor.squeeze(0).numpy()
             preds_squeezed = preds_np.squeeze(0)
-            # Get model quantiles (if available)
-            model_quantiles = None
-            if model is not None and hasattr(model, "loss_type") and model.loss_type == "quantile":
-                model_quantiles = model.quantiles
             try:
                 forecast_plot = plot_multivariate_timeseries(
                     history_values=history_np,

         return metrics
     @spaces.GPU
+    def run_gpu_inference(history_values_tensor, future_values_tensor, start, freq_object):
+        """
+        Run the TempoPFN forward pass and return the results as NumPy arrays.
+
+        The function lazily loads the model into the module-level ``model`` /
+        ``device`` globals on first use, wraps the inputs in a
+        ``BatchTimeSeriesContainer``, runs the model under ``torch.no_grad()``
+        and CUDA bfloat16 autocast, and converts the outputs to NumPy before
+        returning.
+
+        Args:
+            history_values_tensor: Tensor of history values; moved to ``device``
+                for inference and returned as a NumPy array.
+            future_values_tensor: Tensor of future values; moved to ``device``
+                for inference and returned as a NumPy array.
+            start: Start timestamp of the series; passed to the container
+                wrapped in a single-element list.
+            freq_object: Frequency of the series; passed to the container
+                wrapped in a single-element list.
+
+        Returns:
+            Tuple ``(history_np, future_np, preds_np, model_quantiles)``.
+            The arrays are returned without squeezing any dimension.
+            ``model_quantiles`` is ``model.quantiles`` when the model's
+            ``loss_type`` is ``"quantile"``, otherwise ``None``.
+        """
+        global model, device
+        # Lazy one-time load: the checkpoint is downloaded and the model built
+        # only when the global `model` is still None.
+        # NOTE(review): `device` is assigned only in this branch; if `model` is
+        # populated elsewhere, `device` must already be set — confirm.
+        if model is None:
+            print("--- Loading TempoPFN model for the first time ---")
+            device = torch.device("cuda:0")
+            print(f"Downloading model...")
+            model_path = hf_hub_download(repo_id="AutoML-org/TempoPFN", filename="models/checkpoint_38M.pth")
+            print(f"Loading model from {model_path} to {device}...")
+            model = load_model(config_path="configs/example.yaml", model_path=model_path, device=device)
+            print("--- Model loaded successfully ---")
+        # Prepare container: tensors go to the model's device; start and
+        # frequency are each wrapped in a single-element list.
+        container = BatchTimeSeriesContainer(
+            history_values=history_values_tensor.to(device),
+            future_values=future_values_tensor.to(device),
+            start=[start],
+            frequency=[freq_object],
+        )
+        # Run inference with autocast (bfloat16 on CUDA) and gradients disabled
+        with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True):
+            model_output = model(container)
+        # Post-process predictions: cast back to float32, then undo the model's
+        # scaling when both a scaler and scale statistics are available.
+        preds_full = model_output["result"].to(torch.float32)
+        if hasattr(model, "scaler") and "scale_statistics" in model_output:
+            preds_full = model.scaler.inverse_scale(preds_full, model_output["scale_statistics"])
+        # Convert to numpy arrays before returning (detach from GPU)
+        preds_np = preds_full.detach().cpu().numpy()
+        history_np = history_values_tensor.cpu().numpy()
+        future_np = future_values_tensor.cpu().numpy()
+        # Get model quantiles if available
+        # NOTE(review): the type of `model.quantiles` is not visible here —
+        # confirm it is a plain serializable value (e.g. a list) as intended.
+        model_quantiles = None
+        if hasattr(model, "loss_type") and model.loss_type == "quantile":
+            model_quantiles = model.quantiles
+        return history_np, future_np, preds_np, model_quantiles
     def forecast_time_series(data_source, stock_ticker, uploaded_file, forecast_horizon, history_length, seed, synth_generator="Sine Waves", synth_complexity=5):
         """
         Runs the TempoPFN forecast.
             history_values_tensor = values_tensor[:, :-future_length, :]
             future_values_tensor = values_tensor[:, -future_length:, :]
+            # Ensure start is np.datetime64
             if not isinstance(start, np.datetime64):
                 start = np.datetime64(start)
+            # Call GPU inference function (returns only simple types)
+            history_np, future_np, preds_np, model_quantiles = run_gpu_inference(
+                history_values_tensor, future_values_tensor, start, freq_object
             )
+            # Squeeze arrays for plotting
+            history_np = history_np.squeeze(0)
+            future_np = future_np.squeeze(0)
             preds_squeezed = preds_np.squeeze(0)
             try:
                 forecast_plot = plot_multivariate_timeseries(
                     history_values=history_np,