File size: 8,517 Bytes
f679afb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""

House Price Prediction Inference Module

This module provides a simple API for loading the trained California house price
prediction model and making predictions on new data.

"""

import joblib
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Union, Dict, List


class HousePricePredictor:
    """
    A predictor class for California house prices.

    This class loads a pre-trained Random Forest model and its preprocessing
    pipeline from joblib files, and provides methods for making predictions
    on new housing data.

    Typical use is ``HousePricePredictor(...)`` followed by ``load()`` and then
    ``predict(...)`` or ``predict_single(...)``.
    """

    def __init__(self, model_path: str = "house_price_model.joblib",
                 pipeline_path: str = "preprocessing_pipeline.joblib") -> None:
        """
        Record the artifact locations; nothing is read from disk here.

        Args:
            model_path: Path to the trained model joblib file
            pipeline_path: Path to the preprocessing pipeline joblib file
        """
        self.model_path = Path(model_path)
        self.pipeline_path = Path(pipeline_path)
        # Both stay None until load() succeeds; predict() checks for that.
        self.model = None
        self.pipeline = None
        # Columns every input must provide (checked by validate_input).
        self.feature_names = [
            'longitude', 'latitude', 'housing_median_age', 'total_rooms',
            'total_bedrooms', 'population', 'households', 'median_income',
            'ocean_proximity'
        ]
        # Only these category labels are accepted for 'ocean_proximity'.
        self.valid_ocean_proximity = ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND']

    def load(self) -> None:
        """
        Load the model and preprocessing pipeline from disk.

        Raises:
            FileNotFoundError: If the model file or the pipeline file does not exist
        """
        if not self.model_path.exists():
            raise FileNotFoundError(f"Model file not found: {self.model_path}")
        if not self.pipeline_path.exists():
            raise FileNotFoundError(f"Pipeline file not found: {self.pipeline_path}")

        # SECURITY NOTE: joblib files are pickle-based, so loading one can
        # execute arbitrary code. Only point these paths at trusted artifacts.
        self.model = joblib.load(self.model_path)
        self.pipeline = joblib.load(self.pipeline_path)
        print(f"✅ Model loaded successfully from {self.model_path}")
        print(f"✅ Pipeline loaded successfully from {self.pipeline_path}")

    def validate_input(self, data: pd.DataFrame) -> None:
        """
        Validate that input data has all required features.

        Checks that every name in ``self.feature_names`` is a column of
        ``data`` and that every distinct 'ocean_proximity' value is one of
        ``self.valid_ocean_proximity``.

        Args:
            data: DataFrame with input features

        Raises:
            ValueError: If required features are missing or invalid
        """
        missing_features = set(self.feature_names) - set(data.columns)
        if missing_features:
            raise ValueError(f"Missing required features: {missing_features}")

        # Validate ocean_proximity values
        invalid_values = set(data['ocean_proximity'].unique()) - set(self.valid_ocean_proximity)
        if invalid_values:
            raise ValueError(
                f"Invalid ocean_proximity values: {invalid_values}. "
                f"Valid values are: {self.valid_ocean_proximity}"
            )

    def predict(self, data: Union[pd.DataFrame, Dict, List[Dict]]) -> np.ndarray:
        """
        Make predictions on input data.

        Args:
            data: Input data as DataFrame, single dict, or list of dicts.
                  Must contain all required features:
                  - longitude (float): Longitude coordinate
                  - latitude (float): Latitude coordinate
                  - housing_median_age (float): Median age of houses in the block
                  - total_rooms (float): Total number of rooms in the block
                  - total_bedrooms (float): Total number of bedrooms in the block
                  - population (float): Total population in the block
                  - households (float): Total number of households in the block
                  - median_income (float): Median income of households (in tens of thousands)
                  - ocean_proximity (str): Proximity to ocean, one of:
                    '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'

        Returns:
            numpy array of predicted house prices (in dollars)

        Raises:
            RuntimeError: If load() has not been called successfully
            TypeError: If data is not a DataFrame, dict, or list
            ValueError: If data has no rows, lacks required features, or has
                an invalid ocean_proximity value

        Example:
            >>> predictor = HousePricePredictor()
            >>> predictor.load()
            >>> data = {
            ...     'longitude': -122.23,
            ...     'latitude': 37.88,
            ...     'housing_median_age': 41.0,
            ...     'total_rooms': 880.0,
            ...     'total_bedrooms': 129.0,
            ...     'population': 322.0,
            ...     'households': 126.0,
            ...     'median_income': 8.3252,
            ...     'ocean_proximity': 'NEAR BAY'
            ... }
            >>> prediction = predictor.predict(data)
            >>> print(f"Predicted price: ${prediction[0]:,.2f}")
        """
        if self.model is None or self.pipeline is None:
            raise RuntimeError("Model not loaded. Call load() first.")

        # Convert input to DataFrame if needed
        if isinstance(data, dict):
            # A single record becomes a one-row frame.
            data = pd.DataFrame([data])
        elif isinstance(data, list):
            data = pd.DataFrame(data)
        elif not isinstance(data, pd.DataFrame):
            raise TypeError("Input must be a DataFrame, dict, or list of dicts")

        # Reject zero-row input up front: an empty list would otherwise be
        # reported as "missing features", and an empty frame with the right
        # columns would reach the pipeline with nothing to transform.
        if len(data) == 0:
            raise ValueError("Input data contains no rows; nothing to predict.")

        # Validate input
        self.validate_input(data)

        # Prepare data using the preprocessing pipeline.
        # NOTE(review): the frame is handed over as-is (caller's column order,
        # any extra columns included) — confirm the pipeline selects by name.
        prepared_data = self.pipeline.transform(data)

        # Make predictions
        predictions = self.model.predict(prepared_data)

        return predictions

    def predict_single(self, longitude: float, latitude: float,
                       housing_median_age: float, total_rooms: float,
                       total_bedrooms: float, population: float,
                       households: float, median_income: float,
                       ocean_proximity: str) -> float:
        """
        Convenience method to predict a single house price from individual parameters.

        Packs the arguments into one record, delegates to predict(), and
        returns the first (only) prediction as a plain float.

        Args:
            longitude: Longitude coordinate
            latitude: Latitude coordinate
            housing_median_age: Median age of houses in the block
            total_rooms: Total number of rooms in the block
            total_bedrooms: Total number of bedrooms in the block
            population: Total population in the block
            households: Total number of households in the block
            median_income: Median income of households (in tens of thousands)
            ocean_proximity: Proximity to ocean ('<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND')

        Returns:
            Predicted house price in dollars

        Raises:
            RuntimeError: If load() has not been called successfully
            ValueError: If ocean_proximity is not one of the accepted values
        """
        data = {
            'longitude': longitude,
            'latitude': latitude,
            'housing_median_age': housing_median_age,
            'total_rooms': total_rooms,
            'total_bedrooms': total_bedrooms,
            'population': population,
            'households': households,
            'median_income': median_income,
            'ocean_proximity': ocean_proximity
        }

        prediction = self.predict(data)
        return float(prediction[0])


# Convenience functions for quick use
def load_model(model_path: str = "house_price_model.joblib",
               pipeline_path: str = "preprocessing_pipeline.joblib") -> HousePricePredictor:
    """
    Build a HousePricePredictor and load its artifacts in one step.

    Args:
        model_path: Path to the trained model joblib file
        pipeline_path: Path to the preprocessing pipeline joblib file

    Returns:
        A HousePricePredictor on which load() has already been called
    """
    ready = HousePricePredictor(model_path=model_path, pipeline_path=pipeline_path)
    ready.load()
    return ready


if __name__ == "__main__":
    # Demo run: load the default artifacts, then price one sample record.
    print("Loading model...")
    demo_predictor = load_model()

    # One record covering every feature the predictor requires.
    sample_record = {
        'longitude': -122.23,
        'latitude': 37.88,
        'housing_median_age': 41.0,
        'total_rooms': 880.0,
        'total_bedrooms': 129.0,
        'population': 322.0,
        'households': 126.0,
        'median_income': 8.3252,
        'ocean_proximity': 'NEAR BAY'
    }

    print("\nMaking prediction for example data:")
    print(sample_record)

    # predict() returns one value per input row; show the only one.
    predicted_prices = demo_predictor.predict(sample_record)
    print(f"\n✅ Predicted house price: ${predicted_prices[0]:,.2f}")