File size: 8,517 Bytes

f679afb

"""

House Price Prediction Inference Module



This module provides a simple API for loading the trained California house price

prediction model and making predictions on new data.

"""

import joblib
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Union, Dict, List


class HousePricePredictor:
    """

    A predictor class for California house prices.

    

    This class loads a pre-trained Random Forest model and its preprocessing pipeline,

    and provides methods for making predictions on new housing data.

    """
    
    def __init__(self, model_path: str = "house_price_model.joblib", 

                 pipeline_path: str = "preprocessing_pipeline.joblib"):
        """

        Initialize the predictor by loading the model and preprocessing pipeline.

        

        Args:

            model_path: Path to the trained model joblib file

            pipeline_path: Path to the preprocessing pipeline joblib file

        """
        self.model_path = Path(model_path)
        self.pipeline_path = Path(pipeline_path)
        self.model = None
        self.pipeline = None
        self.feature_names = [
            'longitude', 'latitude', 'housing_median_age', 'total_rooms',
            'total_bedrooms', 'population', 'households', 'median_income',
            'ocean_proximity'
        ]
        self.valid_ocean_proximity = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']
        
    def load(self):
        """Load the model and preprocessing pipeline from disk."""
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model file not found: {self.model_path}")
        if not self.pipeline_path.exists():
            raise FileNotFoundError(f"Pipeline file not found: {self.pipeline_path}")
            
        self.model = joblib.load(self.model_path)
        self.pipeline = joblib.load(self.pipeline_path)
        print(f"✅ Model loaded successfully from {self.model_path}")
        print(f"✅ Pipeline loaded successfully from {self.pipeline_path}")
        
    def validate_input(self, data: pd.DataFrame):
        """

        Validate that input data has all required features.

        

        Args:

            data: DataFrame with input features

            

        Raises:

            ValueError: If required features are missing or invalid

        """
        missing_features = set(self.feature_names) - set(data.columns)
        if missing_features:
            raise ValueError(f"Missing required features: {missing_features}")
        
        # Validate ocean_proximity values
        invalid_values = set(data['ocean_proximity'].unique()) - set(self.valid_ocean_proximity)
        if invalid_values:
            raise ValueError(
                f"Invalid ocean_proximity values: {invalid_values}. "
                f"Valid values are: {self.valid_ocean_proximity}"
            )
    
    def predict(self, data: Union[pd.DataFrame, Dict, List[Dict]]) -> np.ndarray:
        """

        Make predictions on input data.

        

        Args:

            data: Input data as DataFrame, single dict, or list of dicts.

                  Must contain all required features:

                  - longitude (float): Longitude coordinate

                  - latitude (float): Latitude coordinate

                  - housing_median_age (float): Median age of houses in the block

                  - total_rooms (float): Total number of rooms in the block

                  - total_bedrooms (float): Total number of bedrooms in the block

                  - population (float): Total population in the block

                  - households (float): Total number of households in the block

                  - median_income (float): Median income of households (in tens of thousands)

                  - ocean_proximity (str): Proximity to ocean, one of:

                    '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'

        

        Returns:

            numpy array of predicted house prices (in dollars)

            

        Example:

            >>> predictor = HousePricePredictor()

            >>> predictor.load()

            >>> data = {

            ...     'longitude': -122.23,

            ...     'latitude': 37.88,

            ...     'housing_median_age': 41.0,

            ...     'total_rooms': 880.0,

            ...     'total_bedrooms': 129.0,

            ...     'population': 322.0,

            ...     'households': 126.0,

            ...     'median_income': 8.3252,

            ...     'ocean_proximity': 'NEAR BAY'

            ... }

            >>> prediction = predictor.predict(data)

            >>> print(f"Predicted price: ${prediction[0]:,.2f}")

        """
        if self.model is None or self.pipeline is None:
            raise RuntimeError("Model not loaded. Call load() first.")
        
        # Convert input to DataFrame if needed
        if isinstance(data, dict):
            data = pd.DataFrame([data])
        elif isinstance(data, list):
            data = pd.DataFrame(data)
        elif not isinstance(data, pd.DataFrame):
            raise TypeError("Input must be a DataFrame, dict, or list of dicts")
        
        # Validate input
        self.validate_input(data)
        
        # Prepare data using the preprocessing pipeline
        prepared_data = self.pipeline.transform(data)
        
        # Make predictions
        predictions = self.model.predict(prepared_data)
        
        return predictions
    
    def predict_single(self, longitude: float, latitude: float, 

                      housing_median_age: float, total_rooms: float,

                      total_bedrooms: float, population: float,

                      households: float, median_income: float,

                      ocean_proximity: str) -> float:
        """

        Convenience method to predict a single house price from individual parameters.

        

        Args:

            longitude: Longitude coordinate

            latitude: Latitude coordinate

            housing_median_age: Median age of houses in the block

            total_rooms: Total number of rooms in the block

            total_bedrooms: Total number of bedrooms in the block

            population: Total population in the block

            households: Total number of households in the block

            median_income: Median income of households (in tens of thousands)

            ocean_proximity: Proximity to ocean ('&lt;1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND')

        

        Returns:

            Predicted house price in dollars

        """
        data = {
            'longitude': longitude,
            'latitude': latitude,
            'housing_median_age': housing_median_age,
            'total_rooms': total_rooms,
            'total_bedrooms': total_bedrooms,
            'population': population,
            'households': households,
            'median_income': median_income,
            'ocean_proximity': ocean_proximity
        }
        
        prediction = self.predict(data)
        return float(prediction[0])


# Convenience functions for quick use
def load_model(model_path: str = "house_price_model.joblib",

               pipeline_path: str = "preprocessing_pipeline.joblib") -> HousePricePredictor:
    """

    Load and return a HousePricePredictor instance.

    

    Args:

        model_path: Path to the trained model joblib file

        pipeline_path: Path to the preprocessing pipeline joblib file

        

    Returns:

        Loaded HousePricePredictor instance

    """
    predictor = HousePricePredictor(model_path, pipeline_path)
    predictor.load()
    return predictor


if __name__ == "__main__":
    # Example usage
    print("Loading model...")
    predictor = load_model()
    
    # Example prediction
    example_data = {
        'longitude': -122.23,
        'latitude': 37.88,
        'housing_median_age': 41.0,
        'total_rooms': 880.0,
        'total_bedrooms': 129.0,
        'population': 322.0,
        'households': 126.0,
        'median_income': 8.3252,
        'ocean_proximity': 'NEAR BAY'
    }
    
    print("\nMaking prediction for example data:")
    print(example_data)
    
    prediction = predictor.predict(example_data)
    print(f"\n✅ Predicted house price: ${prediction[0]:,.2f}")