Skip to main content
Version: v1.4.1

Regression - House Prices Advanced Regression

Predicting house prices using advanced regression techniques with comprehensive feature engineering.

- Dataset Source: Kaggle House Prices Competition
- Problem Type: Regression
- Target Variable: SalePrice (the final sale price of each house)
- Use Case: Real estate valuation, property investment analysis, market trend prediction

Package Imports

import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xplainable as xp
from sklearn.model_selection import train_test_split
from xplainable.core.models import XRegressor
from xplainable.core.optimisation.genetic import XEvolutionaryNetwork
from xplainable.core.optimisation.layers import Evolve, Tighten
from xplainable_client.client.base import XplainableAPIError
from xplainable_client.client.client import XplainableClient
from xplainable_preprocessing import PipelineSpec, StepSpec, compile_spec
1!pip install xplainable
2!pip install xplainable-client

Xplainable Cloud Setup

# Initialize the Xplainable Cloud client.
# Create an API key at https://platform.xplainable.io/ and expose it through the
# XPLAINABLE_API_KEY environment variable instead of hard-coding it here.
client = XplainableClient(
    api_key=os.environ.get("XPLAINABLE_API_KEY", ""),
    hostname="https://platform.xplainable.io",
)

Data Loading and Exploration

Load the House Prices dataset from Kaggle.

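Note: Download the dataset from Kaggle or use the Kaggle API. If train.csv and test.csv are not found, the code below falls back to a small synthetic housing dataset so the rest of the example can still run.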

# Load dataset
# Only the file reads sit inside the `try`, so the FileNotFoundError handler
# cannot be triggered by (or hide) anything else.
try:
    # If you have downloaded the dataset manually
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
except FileNotFoundError:
    print("Dataset files not found. Please download from Kaggle:")
    print("https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data")
    print("Or use: kaggle competitions download -c house-prices-advanced-regression-techniques")

    # For testing purposes, create a simple synthetic dataset
    print("\nCreating synthetic dataset for testing...")
    np.random.seed(42)  # fixed seed keeps the synthetic data reproducible

    # Create a simple synthetic housing dataset
    n_samples = 1000
    train_df = pd.DataFrame({
        'LotArea': np.random.normal(10000, 2000, n_samples),
        'YearBuilt': np.random.randint(1950, 2020, n_samples),
        'TotalBsmtSF': np.random.normal(1000, 300, n_samples),
        'GrLivArea': np.random.normal(1500, 400, n_samples),
        'GarageCars': np.random.randint(0, 4, n_samples),
        'Neighborhood': np.random.choice(['A', 'B', 'C', 'D'], n_samples),
        'HouseStyle': np.random.choice(['1Story', '2Story', 'Split'], n_samples),
    })

    # Create target variable as a noisy linear combination of the numeric features
    train_df['SalePrice'] = (
        train_df['LotArea'] * 0.01
        + train_df['GrLivArea'] * 80
        + train_df['TotalBsmtSF'] * 30
        + train_df['GarageCars'] * 5000
        + np.random.normal(0, 10000, n_samples)
    ).astype(int)

    print(f"Synthetic dataset created: {train_df.shape}")
    print("Target variable statistics:")
    print(train_df['SalePrice'].describe())
else:
    # Display basic information about the real Kaggle data
    print(f"Training dataset shape: {train_df.shape}")
    print(f"Test dataset shape: {test_df.shape}")
    print("Target variable statistics:")
    print(train_df['SalePrice'].describe())

# Preview the first rows; as the last top-level expression of the cell it is
# rendered by the notebook for both the real and the synthetic dataset.
train_df.head()

1. Data Preprocessing

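Handle missing values before modelling: drop the non-informative Id column, fill missing numeric values with the median, and fill any remaining missing categorical values with the mode.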

# Define preprocessing pipeline using PipelineSpec
steps = []

# Drop non-informative 'Id' column if present
if 'Id' in train_df.columns:
    steps.append(
        StepSpec(
            transformer="DropColumnsTransformer",
            params={"columns": ["Id"]},
        )
    )

# Fill missing values using the median strategy; gaps that remain afterwards
# are filled with the mode once the pipeline has run (see below)
steps.append(
    StepSpec(
        transformer="FillMissingTransformer",
        params={"strategy": "median"},
    )
)

# Build and compile the pipeline spec
spec = PipelineSpec(steps=steps)
pipeline = compile_spec(spec)

# Apply preprocessing
df_processed = pipeline.fit_transform(train_df)

# Fill any remaining missing categorical values with mode
# (`mode().iloc[0]` takes the first, i.e. most frequent, value of each column)
df_processed = df_processed.fillna(df_processed.mode().iloc[0])

print(f"Processed dataset shape: {df_processed.shape}")
print(f"Missing values: {df_processed.isnull().sum().sum()}")
df_processed.head()

Preprocessor Persistence

Save the preprocessing pipeline spec to Xplainable Cloud for reproducibility.

# Persist the preprocessor to Xplainable Cloud
# Uncomment to save preprocessor
# try:
#     preprocessor_id = client.preprocessing.create_preprocessor(
#         spec=spec,
#         name="House Prices Preprocessor",
#         description="Drops Id column, fills missing numeric values with median"
#     )
#     print(f"Preprocessor created with ID: {preprocessor_id}")
# except XplainableAPIError as e:
#     print(f"Error creating preprocessor: {e}")

Create Train/Test Split

# Separate the feature columns from the SalePrice target
X = df_processed.drop(columns=['SalePrice'])
y = df_processed['SalePrice']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

2. Model Optimization

Optimize the model using genetic algorithms with evolutionary networks for house price prediction.

# First train a base model
model = XRegressor()
model.fit(X_train, y_train)

# Create evolutionary network for optimization
network = XEvolutionaryNetwork(model)

# Add optimization layers
# Start with an initial Tighten layer
network.add_layer(
    Tighten(
        iterations=100,
        learning_rate=0.1,
        early_stopping=20,
    )
)

# Add an Evolve layer with high severity for exploration
network.add_layer(
    Evolve(
        mutations=100,
        generations=50,
        max_severity=0.5,
        max_leaves=20,
        early_stopping=20,
    )
)

# Add another Evolve layer with lower severity for refinement
network.add_layer(
    Evolve(
        mutations=100,
        generations=50,
        max_severity=0.3,
        max_leaves=15,
        early_stopping=20,
    )
)

# Add a final Tighten layer with low learning rate for fine-tuning
network.add_layer(
    Tighten(
        iterations=100,
        learning_rate=0.025,
        early_stopping=20,
    )
)

# Fit the network and run optimization
network.fit(X_train, y_train)
network.optimise()

3. Model Training

The model has been trained and optimized through the evolutionary network process.

# Model is already trained through the evolutionary network
# Let's evaluate the optimized model performance
train_performance = model.evaluate(X_train, y_train)
print("Training Performance:")
for metric, value in train_performance.items():
    print(f"{metric}: {value:.4f}")

4. Model Interpretability and Explainability

Analyze which features most influence house price predictions.

# Show the model explanation (how each feature contributes to predictions)
model.explain()

5. Model Persistence (Optional)

Save the model to Xplainable Cloud for collaboration and deployment.

# Optional: save the model to Xplainable Cloud (requires a valid API key)
try:
    model_id, version_id = client.models.create_model(
        model=model,
        model_name="House Prices Regression Model",
        model_description="Predicting house sale prices using property characteristics",
        x=X_train,
        y=y_train,
    )
except XplainableAPIError as e:
    print(f"Error creating model: {e}")

6. Model Deployment (Optional)

Deploy the model for real-time predictions.

# Optional: deploy the model (uses `version_id` from the model persistence step)
# `deployment_id` stays None when deployment fails, so the steps that depend on
# it are skipped instead of raising NameError.
deployment_id = None
try:
    deployment_response = client.deployments.deploy(
        model_version_id=version_id
    )
    deployment_id = deployment_response.deployment_id
except XplainableAPIError as e:
    print(f"Error deploying model: {e}")

if deployment_id is not None:
    # Activate deployment
    try:
        client.deployments.activate_deployment(deployment_id=deployment_id)
    except XplainableAPIError as e:
        print(f"Error activating deployment: {e}")

    # Generate deploy key for inference
    try:
        deploy_key = client.deployments.generate_deploy_key(
            deployment_id=deployment_id,
            name="House Prices Deployment Key"
        )
        print(f"Deploy key created: {str(deploy_key)}")
    except XplainableAPIError as e:
        print(f"Error generating deploy key: {e}")

# Generate example deployment payload
try:
    example_payload = client.deployments.generate_example_deployment_payload(
        model_version_id=version_id
    )
    print("Example payload:", example_payload)
except XplainableAPIError as e:
    print(f"Error generating example payload: {e}")

7. Model Testing

Evaluate model performance on test data.

# Evaluate on test set
test_predictions = model.predict(X_test)
test_performance = model.evaluate(X_test, y_test)

print("Test Set Performance:")
for metric, value in test_performance.items():
    print(f"{metric}: {value:.4f}")

# Plot predictions vs actual
plt.figure(figsize=(10, 6))
plt.scatter(y_test, test_predictions, alpha=0.5)
# Dashed red diagonal marks perfect predictions (predicted == actual)
price_range = [y_test.min(), y_test.max()]
plt.plot(price_range, price_range, 'r--', lw=2)
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.title('House Price Predictions vs Actual Values')
plt.show()