Skip to main content
Version: v1.4.1

Regression - Dozer Price Prediction

Brief description of predicting bulldozer sale prices using the Bluebook dataset.

Dataset Source: Kaggle Bluebook for Bulldozers. Problem Type: Regression. Target Variable: SalePrice - the sale price of the bulldozer at auction. Use Case: Price prediction for heavy equipment, identifying arbitrage opportunities in equipment sales.

Package Imports

1!pip install xplainable
2!pip install xplainable-client
1import pandas as pd
2import xplainable as xp
3from xplainable.core.models import XRegressor
4from xplainable.core.optimisation.genetic import XEvolutionaryNetwork
5from xplainable.core.optimisation.layers import Evolve, Tighten
6from xplainable_preprocessing import PipelineSpec, StepSpec, compile_spec
7from sklearn.model_selection import train_test_split
8import requests
9import json
10
11# Additional imports specific to this example
12import numpy as np
13import matplotlib.pyplot as plt
14import seaborn as sns
15from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error, explained_variance_score, mean_squared_log_error
16
17from xplainable_client.client.client import XplainableClient
18from xplainable_client.client.base import XplainableAPIError
1print(f"This notebook was created using Xplainable version {xp.__version__}")
Out:

This notebook was created using Xplainable version 1.3.0

Xplainable Cloud Setup

1from sklearn.metrics import mean_absolute_error
2import numpy as np
3import matplotlib.pyplot as plt
4import pandas as pd
5
def plot_error(model, x, y, alpha=0.5, color_column=None):
    """Scatter-plot true against predicted values and show the MAE in the title.

    Args:
        model: Fitted estimator exposing ``predict(x)``.
        x: Feature DataFrame passed to ``model.predict``.
        y: True target values aligned with ``x``.
        alpha: Marker transparency.
        color_column: Optional column of ``x`` used to colour the points by
            category. When omitted, points are coloured by absolute error.

    Raises:
        ValueError: If ``color_column`` is given but is not a column of ``x``.
    """
    # Validate first so a bad argument does not leave an empty figure behind.
    if color_column is not None and color_column not in x.columns:
        raise ValueError(f"The color_column {color_column} is not in the DataFrame.")

    y_pred = model.predict(x)
    mae = mean_absolute_error(y, y_pred)

    fig, ax = plt.subplots(figsize=(12, 8))

    if color_column is not None:
        # Convert once and reuse both the labels and the integer codes.
        categorical = x[color_column].astype('category')
        categories = categorical.cat.categories
        codes = categorical.cat.codes
        scatter = ax.scatter(y, y_pred, c=codes, alpha=alpha, cmap='plasma')

        # Build the legend from the codes that actually occur so handles and
        # labels always line up; pandas encodes missing values as -1.
        present_codes = np.unique(codes)
        labels = ['Missing' if code < 0 else str(categories[code]) for code in present_codes]
        handles = [
            plt.Line2D([0], [0], marker='o', color='w',
                       markerfacecolor=scatter.cmap(scatter.norm(code)),
                       markersize=10)
            for code in present_codes
        ]
        ax.legend(handles, labels, title=color_column)
    else:
        errors = abs(y - y_pred)
        scatter = ax.scatter(y, y_pred, c=errors, alpha=alpha, cmap='plasma')
        plt.colorbar(scatter, ax=ax, label='Absolute Error')

    # Line for perfect predictions
    max_val = np.maximum(y.max(), y_pred.max())
    ax.plot([0, max_val], [0, max_val], 'k--', lw=2)

    # Labels and title
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(f'Scatter Plot of True vs Predicted Values with MAE: {mae:.2f}')

    plt.show()
42
43# Example usage:
44# plot_error(model, X_train, y_train, alpha=0.5, color_column='ModelID')

Data Loading and Exploration

Load the Bluebook for Bulldozers dataset

It's possible to download the Bluebook dozer price prediction dataset at the following link: https://www.kaggle.com/c/bluebook-for-bulldozers/data

Following extraction of the .zip file build the dataset as below:

# Read the combined train/validation extract, parsing the sale date as a datetime
DATASET_URL = 'https://xplainable-public-storage.syd1.digitaloceanspaces.com/example_data/TrainAndValid.csv'
df = pd.read_csv(DATASET_URL, parse_dates=['saledate'])

# Report the frame dimensions, then preview the first rows
print(f"Dataset shape: {df.shape}")
df.head()
SalesIDSalePriceMachineIDModelIDdatasourceauctioneerIDYearMadeMachineHoursCurrentMeterUsageBandsaledate...Undercarriage_Pad_WidthStick_LengthThumbPattern_ChangerGrouser_TypeBackhoe_MountingBlade_TypeTravel_ControlsDifferential_TypeSteering_Controls
011392466600099908931571213200468Low2006-11-16...nannannannannannannannanStandardConventional
111392485700011765777121319964640Low2004-03-26...nannannannannannannannanStandardConventional
21139249100004348087009121320012838High2004-02-26...nannannannannannannannannannan
31139251385001026470332121320013486High2011-05-19...nannannannannannannannannannan
411392531100010573731731112132007722Medium2009-07-23...nannannannannannannannannannan
Out:

Dataset shape: (412698, 53)

1df.head()
SalesIDSalePriceMachineIDModelIDdatasourceauctioneerIDYearMadeMachineHoursCurrentMeterUsageBandsaledate...Undercarriage_Pad_WidthStick_LengthThumbPattern_ChangerGrouser_TypeBackhoe_MountingBlade_TypeTravel_ControlsDifferential_TypeSteering_Controls
011392466600099908931571213200468Low2006-11-16...nannannannannannannannanStandardConventional
111392485700011765777121319964640Low2004-03-26...nannannannannannannannanStandardConventional
21139249100004348087009121320012838High2004-02-26...nannannannannannannannannannan
31139251385001026470332121320013486High2011-05-19...nannannannannannannannannannan
411392531100010573731731112132007722Medium2009-07-23...nannannannannannannannannannan

Add the machine appendix to concatenate information about the dozer assets

1ma = pd.read_csv('https://xplainable-public-storage.syd1.digitaloceanspaces.com/example_data/Machine_Appendix.csv')
1ma.head()
MachineIDModelIDfiModelDescfiBaseModelfiSecondaryDescfiModelSeriesfiModelDescriptorfiProductClassDescProductGroupProductGroupDescMfgYearfiManufacturerIDfiManufacturerDescPrimarySizeBasisPrimaryLowerPrimaryUpper
01131355350L350nannanLHydraulic Excavator, Track - 50.0 to 66.0 Metr...TEXTrack Excavators199426CaterpillarWeight - Metric Tons5066
14343538416C416CnannanBackhoe Loader - 14.0 to 15.0 Ft Standard Digg...BLBackhoe Loaders199726CaterpillarStandard Digging Depth - Ft1415
25343538416C416CnannanBackhoe Loader - 14.0 to 15.0 Ft Standard Digg...BLBackhoe Loaders199826CaterpillarStandard Digging Depth - Ft1415
37183538416C416CnannanBackhoe Loader - 14.0 to 15.0 Ft Standard Digg...BLBackhoe Loaders200026CaterpillarStandard Digging Depth - Ft1415
417531580D5GLGPD5GnanLGPTrack Type Tractor, Dozer - 85.0 to 105.0 Hors...TTTTrack Type Tractors200626CaterpillarHorsepower85105

Merging the dataset on the MachineID to extract useful information:

  • Find the columns that exist within the machine appendix that aren't in the training dataset
  • Merge the new columns on the existing train dataset to enrich the information
1new_cols = [col for col in ma.columns if col not in df.columns]
2ma[new_cols].head()
MfgYearfiManufacturerIDfiManufacturerDescPrimarySizeBasisPrimaryLowerPrimaryUpper
0199426CaterpillarWeight - Metric Tons5066
1199726CaterpillarStandard Digging Depth - Ft1415
2199826CaterpillarStandard Digging Depth - Ft1415
3200026CaterpillarStandard Digging Depth - Ft1415
4200626CaterpillarHorsepower85105
# Enrich each sale with the appendix columns it is missing, joined on the machine key.
merge_col = "MachineID"
df = pd.merge(
    df,
    ma[[merge_col] + new_cols],
    on=merge_col,  # reuse the key defined above rather than repeating the literal
    how="left",
    # Fail fast if the appendix ever repeats a MachineID, which would silently duplicate sales rows
    validate="many_to_one",
)

1. Data Preprocessing

Feature Engineering and Data Preparation

# Step 1: derive calendar features from saledate while the column still exists
sale_dt = df['saledate'].dt
df['saleyear'] = sale_dt.year
df['salemonth'] = sale_dt.month
df['saledayofweek'] = sale_dt.day_name()

# Step 2: describe the remaining transformations as a PipelineSpec
# Underscore-free name for every column that currently contains an underscore
underscore_free = {col: col.replace("_", "") for col in df.columns if "_" in col}

# saledate is no longer needed once the features above have been extracted
drop_sale_date = StepSpec(
    transformer="DropColumnsTransformer",
    params={"columns": ["saledate"]},
)
strip_underscores = StepSpec(
    transformer="RenameColumnsTransformer",
    params={"mapping": underscore_free},
)
# ModelID is cast to string so it doesn't create regression splits
model_id_as_str = StepSpec(
    transformer="TypeCastTransformer",
    params={"dtypes": {"ModelID": "str"}},
)
spec = PipelineSpec(steps=[drop_sale_date, strip_underscores, model_id_as_str])

pipeline = compile_spec(spec)
df = pipeline.fit_transform(df)

# Keep only rows whose YearMade is later than 1920
df = df.loc[df.YearMade > 1920]

print(f"Processed dataset shape: {df.shape}")
df.head()

Preprocessor Persistence

Save the preprocessing pipeline spec to Xplainable Cloud for reproducibility.

1# Persist the preprocessor to Xplainable Cloud
2# Uncomment to save preprocessor
3# try:
4# preprocessor_id = client.preprocessing.create_preprocessor(
5# spec=spec,
6# name="Dozer Price Prediction Preprocessor",
7# description="Drops saledate, renames columns, casts ModelID to string"
8# )
9# print(f"Preprocessor created with ID: {preprocessor_id}")
10# except XplainableAPIError as e:
11# print(f"Error creating preprocessor: {e}")
1df
SalesIDSalePriceMachineIDModelIDdatasourceauctioneerIDYearMadeMachineHoursCurrentMeterUsageBandfiModelDesc...SteeringControlsMfgYearfiManufacturerIDfiManufacturerDescPrimarySizeBasisPrimaryLowerPrimaryUppersaleyearsalemonthsaledayofweek
0113924666000.099908931571213.0200468.0Low521D...Conventional2004.025CaseHorsepower110.0120.0200611Thursday
1113924857000.0117657771213.019964640.0Low950FII...Conventional1996.026CaterpillarHorsepower150.0175.020043Friday
2113924910000.043480870091213.020012838.0High226...nan2001.026CaterpillarOperating Capacity - Lbs1351.01601.020042Thursday
3113925138500.010264703321213.020013486.0HighPC120-6E...nan2010.0103KomatsuHorsepower225.0250.020115Thursday
4113925311000.01057373173111213.02007722.0MediumS175...nan2007.0121BobcatOperating Capacity - Lbs1601.01751.020097Thursday
..................................................................
412693633334410000.01919201214351492.02005nannan30NX...nan2005.02552IHIWeight - Metric Tons2.03.020123Wednesday
412694633334510500.01882122214361492.02005nannan30NX2...nan2005.02552IHIWeight - Metric Tons3.04.020121Saturday
412695633334712500.01944213214351492.02005nannan30NX...nan2005.02552IHIWeight - Metric Tons2.03.020121Saturday
412696633334810000.01794518214351492.02006nannan30NX...nan2006.02552IHIWeight - Metric Tons2.03.020123Wednesday
412697633334913000.01944743214361492.02006nannan30NX2...nan2005.02552IHIWeight - Metric Tons2.03.020121Saturday

Train on the top 10 dozer assets by count

To keep training time short, filter the data to the top 10 assets (ModelIDs) by count

1models_to_train = df.ModelID.value_counts().index[:10].to_list()
1models_to_train
Out:

['4605',

'3538',

'4604',

'3170',

'3362',

'3537',

'4603',

'3171',

'3357',

'3178']

1df[df.ModelID.isin(models_to_train)]
SalesIDSalePriceMachineIDModelIDdatasourceauctioneerIDYearMadeMachineHoursCurrentMeterUsageBandfiModelDesc...SteeringControlsMfgYearfiManufacturerIDfiManufacturerDescPrimarySizeBasisPrimaryLowerPrimaryUppersaleyearsalemonthsaledayofweek
5113925526500.0100127446051213.02004508.0Low310G...nan2004.043John DeereStandard Digging Depth - Ft14.015.0200812Thursday
10113927824000.0102499846051213.020041414.0Medium310G...nan2004.043John DeereStandard Digging Depth - Ft14.015.020088Thursday
15113929119000.0100481046041213.019992450.0Medium310E...nan1999.043John DeereStandard Digging Depth - Ft14.015.0200611Thursday
62113946923000.0105886931711213.019989987.0High580L...nan1998.025CaseStandard Digging Depth - Ft14.015.020075Thursday
82113951533000.0101556546051213.020021268.0Medium310G...nan2002.043John DeereStandard Digging Depth - Ft14.015.020047Thursday
..................................................................
410243628823918200.01835461460414999.0200048.0Low310E...nan2000.043John DeereStandard Digging Depth - Ft14.015.020122Wednesday
410244628824025250.0190391446051490.020051988.0Low310G...nan2005.043John DeereStandard Digging Depth - Ft14.015.020121Saturday
410245628824125250.01860549460514999.02006nannan310G...nan2006.043John DeereStandard Digging Depth - Ft14.015.020124Wednesday
410246628824325000.0184618446051491.02006nannan310G...nan2006.043John DeereStandard Digging Depth - Ft14.015.020123Thursday
410264628834620500.0186708746041494.02000nannan310E...nan2000.043John DeereStandard Digging Depth - Ft14.015.020122Monday
# Take an independent copy of the selected models so the column assignments
# below act on `data` itself and do not raise SettingWithCopyWarning.
data = df[df.ModelID.isin(models_to_train)].copy()

# Drop columns that are missing in every row of the selection
m = data.isna().sum()
data = data.loc[:, m < len(data)]

# Drop cols cardinality of 1
s = data.nunique()
car_cols = data.columns[(s == 1)]
data = data.drop(columns=car_cols)

# Update numeric columns to be float64
n_cols = data.select_dtypes(include=np.number).columns.tolist()
data[n_cols] = data[n_cols].astype('float64')
1data.head()
SalesIDSalePriceMachineIDModelIDdatasourceauctioneerIDYearMadeMachineHoursCurrentMeterUsageBandfiModelDesc...TireSizeMfgYearfiManufacturerIDfiManufacturerDescPrimarySizeBasisPrimaryLowerPrimaryUppersaleyearsalemonthsaledayofweek
51.13926e+06265001.00127e+06460512132004508Low310G...nan200443John DeereStandard Digging Depth - Ft1415200812Thursday
101.13928e+06240001.025e+064605121320041414Medium310G...nan200443John DeereStandard Digging Depth - Ft141520088Thursday
151.13929e+06190001.00481e+064604121319992450Medium310E...nan199943John DeereStandard Digging Depth - Ft1415200611Thursday
621.13947e+06230001.05887e+063171121319989987High580L...nan199825CaseStandard Digging Depth - Ft141520075Thursday
821.13952e+06330001.01556e+064605121320021268Medium310G...nan200243John DeereStandard Digging Depth - Ft141520047Thursday

Addressing Multicollinearity in Model Interpretability

It's well-understood in data science that multicollinearity can significantly hamper the interpretability of models, particularly those based on linear assumptions. The code snippet below demonstrates a rudimentary approach to mitigating multicollinearity by removing highly correlated features. However, it's important to acknowledge that this is a simplified illustration; in practice, the interplay between features can be more subtle and complex.

For robust feature selection and to enhance model explainability, we employ automated feature selection techniques that are thoroughly documented in our project's documentation. These methods go beyond pairwise correlations, considering the multidimensional structure of the data to retain the most informative features. While the current example is not exhaustive, it serves to highlight a fundamental step in preprocessing for linear models. Practitioners are encouraged to leverage our automatic feature selection capabilities to refine their models further and to ensure that the explanatory variables employed are truly reflective of independent factors influencing the response variable.

# Age of the machine at sale, computed from the working frame's own columns
# instead of reaching back into `df` and relying on index alignment.
data["AgeAtSale"] = data["saleyear"] - data["MfgYear"]

# Columns removed before modelling
drop_cols = [
    # "saleyear", #--> Data encoded in Age at Sale
    "MfgYear",  #--> Data encoded in Age at Sale
    "YearMade",  #--> Multicollinearity with MfgYear
]

target = 'SalePrice'
id_columns = ["SalesID", 'MachineID', 'auctioneerID', 'datasource']

Split the train and validation set

# Hold out the 2012 sales for validation; every other sale year is used for training
is_holdout = data.saleyear == 2012
data_train = data[~is_holdout].drop(columns=drop_cols)
data_val = data[is_holdout].drop(columns=drop_cols)

# Separate the features from the SalePrice target for each split
X_train = data_train.drop(columns='SalePrice')
y_train = data_train['SalePrice']
X_valid = data_val.drop(columns='SalePrice')
y_valid = data_val['SalePrice']

model = XRegressor(ignore_nan=False)

2. Model Training

Initial Model Training

1model.fit(X_train, y_train, id_columns=id_columns)
Out:

<xplainable.core.ml.regression.XRegressor at 0x115b177f0>

1model.evaluate(X_train, y_train)
Out:

{'Explained Variance': 0.8476,

'MAE': 4292.3689,

'MAPE': 0.1616,

'MSE': 39079631.4865,

'RMSE': 6251.3704,

'RMSLE': nan,

'R2 Score': 0.8476}

1model.explain()

3. Model Optimization

Evolutionary Network Optimization

network = XEvolutionaryNetwork(model)

# Optimisation layers, listed in the order they are added to the network
layers = [
    # Initial Tighten layer
    Tighten(iterations=100, learning_rate=0.1, early_stopping=20),
    # Evolve layer with a high severity
    Evolve(mutations=100, generations=50, max_severity=0.5, max_leaves=20, early_stopping=20),
    # Evolve layer with a lower severity and reach
    Evolve(mutations=100, generations=50, max_severity=0.3, max_leaves=15, early_stopping=20),
    # Final Tighten layer with a low learning rate
    Tighten(iterations=100, learning_rate=0.025, early_stopping=20),
]
for layer in layers:
    network.add_layer(layer)

# Fit the network on the training features without the id columns
network.fit(X_train.drop(columns=id_columns), y_train)

# Run the network
network.optimise()
Out:

0%| | 0/100 [00:00<?, ?it/s]

0%| | 0/50 [00:00<?, ?it/s]

0%| | 0/50 [00:00<?, ?it/s]

0%| | 0/100 [00:00<?, ?it/s]

<xplainable.core.optimisation.genetic.XEvolutionaryNetwork at 0x2d722ac80>

1model.evaluate(X_train, y_train)
Out:

{'Explained Variance': 0.8442,

'MAE': 4127.0074,

'MAPE': 0.1476,

'MSE': 40223162.9493,

'RMSE': 6342.1734,

'RMSLE': nan,

'R2 Score': 0.8431}

Simply by fitting a combination of 4 Tighten and Evolve layers we have decreased the training MAE by approximately 165 (from 4292 to 4127). Play around with more layers to see if it's possible to obtain better results.

1plot_error(model, X_train, y_train)

Comparing against the validation set

1model.evaluate(X_valid, y_valid)
Out:

{'Explained Variance': 0.8458,

'MAE': 4845.2839,

'MAPE': 0.1778,

'MSE': 45839082.7702,

'RMSE': 6770.4566,

'RMSLE': nan,

'R2 Score': 0.8071}

1plot_error(model, X_valid, y_valid)

4. Model Interpretability and Explainability

Model Feature Importance Analysis

Explaining the variance in the Error Plot

Prior to examining the detailed error plot, it is essential to consider the real-world operational differences among various bulldozer models, as well as the insights provided by subject matter experts (SMEs). These differences are likely to manifest as distinct groupings in the predicted versus actual results. Each model type's unique characteristics—such as age, usage and maintenance history—are factors that could create these groups, affecting the sale prices and thus the prediction accuracy. Recognizing these potential variances will prepare us to understand and address the disparities in the predictive performance across different Model IDs that the following plot will reveal.

1plot_error(model, X_train, y_train, alpha=0.4, color_column="ModelID")

Insights from Scatter Plot Analysis

The scatter plot displayed above demonstrates a significant variation in the predictive accuracy across different Model IDs, as indicated by the spread of points in relation to the black dashed line, which represents perfect prediction. Models such as those in the yellow cluster are closely aligned with the line, suggesting higher prediction accuracy for these Model IDs. This observation underscores the importance of partitioning the dataset to develop model-specific predictive algorithms. By doing so, we can account for the unique characteristics of each model, which may include factors specific to the model that affect the score contributions.

5. Model Persistence

Save Model to Xplainable Cloud

Step 1: Instantiate the Client

Connect to the Xplainable API using your provided API key and the local hostname. This allows further interaction with the platform for model creation and deployment.

import os

# Initialize Xplainable Cloud client.
# The API key is read from the environment so that no secret is committed with
# the notebook. Create a key in Xplainable Cloud (https://platform.xplainable.io/)
# and export it as XPLAINABLE_API_KEY before running this cell.
client = XplainableClient(
    api_key=os.environ["XPLAINABLE_API_KEY"],
    # Local development server; use "https://platform.xplainable.io" for the hosted platform
    hostname="http://localhost:8000"
)
Out:

<Response [200]>

Step 2: Create a Model

Define and create a machine learning model on the Xplainable platform. This includes setting a name, description, and providing training features (X_train) and targets (y_train).

1# Create a model
2try:
3 model_id, version_id = client.models.create_model(
4 model=model,
5 model_name="Dozer Price Prediction",
6 model_description="Predicting the price of a different dozer types",
7 x=X_train,
8 y=y_train
9 )
10except XplainableAPIError as e:
11 print(f"Error creating model: {e}")
Out:

0%| | 0/41 [00:00<?, ?it/s]

6. Model Deployment

Deploy Model for Inference

# Deploy the model version created above and keep the id of the new deployment
try:
    deployment_response = client.deployments.deploy(
        model_version_id=version_id  # <- Use version id produced above
    )
except XplainableAPIError as e:
    print(f"Error deploying model: {e}")
else:
    deployment_id = deployment_response.deployment_id

Step 4: Activate the Deployment

Activate the model deployment so that it’s ready to receive inference requests.

1try:
2 client.deployments.activate_deployment(deployment_id=deployment_id)
3except XplainableAPIError as e:
4 print(f"Error activating deployment: {e}")

Step 5: Generate a Deploy Key

Generate an API deploy key for secure access to the deployed model. This key will be used to authenticate when making prediction requests.

1try:
2 deploy_key = client.deployments.generate_deploy_key(
3 deployment_id=deployment_id,
4 description='API key for Dozer Price Prediction',
5 days_until_expiry=30
6 )
7 print(f"Deploy key created: {str(deploy_key)}")
8except XplainableAPIError as e:
9 print(f"Error generating deploy key: {e}")

Step 6: Format a Sample Input

Prepare a single test sample (excluding the target column SalePrice) to be used for model inference. This sample is converted to JSON format for use in an API call.

# Draw the sample from the same feature layout the model was trained on:
# `data` carries the engineered AgeAtSale column, and drop_cols removes the
# columns that were excluded from training. The raw `df` has neither change.
inference_features = data.drop(columns=drop_cols + ["SalePrice"])
body = json.loads(
    inference_features[inference_features.ModelID == "4605"].sample(1).to_json(orient="records")
)
4
1body
Out:

[{'SalesID': 1638768,

'MachineID': 1520710,

'ModelID': '4605',

'datasource': 132,

'auctioneerID': 1.0,

'YearMade': 2001,

'MachineHoursCurrentMeter': None,

'UsageBand': None,

'fiModelDesc': '310G',

'fiBaseModel': '310',

'fiSecondaryDesc': 'G',

'fiModelSeries': None,

'fiModelDescriptor': None,

'ProductSize': None,

'fiProductClassDesc': 'Backhoe Loader - 14.0 to 15.0 Ft Standard Digging Depth',

'state': 'Maryland',

'ProductGroup': 'BL',

'ProductGroupDesc': 'Backhoe Loaders',

'DriveSystem': 'Four Wheel Drive',

'Enclosure': 'EROPS',

'Forks': 'None or Unspecified',

'PadType': 'None or Unspecified',

'RideControl': 'No',

'Stick': 'Extended',

'Transmission': 'Standard',

'Turbocharged': 'None or Unspecified',

'BladeExtension': None,

'BladeWidth': None,

'EnclosureType': None,

'EngineHorsepower': None,

'Hydraulics': None,

'Pushblock': None,

'Ripper': None,

'Scarifier': None,

'TipControl': None,

'TireSize': None,

'Coupler': None,

'CouplerSystem': None,

'GrouserTracks': None,

'HydraulicsFlow': None,

'TrackType': None,

'UndercarriagePadWidth': None,

'StickLength': None,

'Thumb': None,

'PatternChanger': None,

'GrouserType': None,

'BackhoeMounting': None,

'BladeType': None,

'TravelControls': None,

'DifferentialType': None,

'SteeringControls': None,

'MfgYear': 2001.0,

'fiManufacturerID': 43,

'fiManufacturerDesc': 'John Deere',

'PrimarySizeBasis': 'Standard Digging Depth - Ft',

'PrimaryLower': 14.0,

'PrimaryUpper': 15.0,

'saleyear': 2004,

'salemonth': 10,

'saledayofweek': 'Wednesday'}]

Step 7: Send a Prediction Request

Make a POST request to the Xplainable inference endpoint with the sample input. The deploy_key is included in the headers for authentication, and the model returns a prediction based on the JSON-formatted input data.

# Send the sample to the inference endpoint, authenticating with the deploy key
response = requests.post(
    url="https://inference.xplainable.io/v1/predict",
    headers={'api_key': str(deploy_key)},
    json=body,
    timeout=30,  # requests waits indefinitely by default; bound the wait
)
# Surface HTTP errors (expired key, inactive deployment) instead of parsing an error page
response.raise_for_status()

value = response.json()
print("Prediction result:", value)

7. Partitioned Models

Enhanced Model Performance with Partitioning

The Power of Partitioned Models in Price Prediction

When predicting prices for heavy equipment like in the Bluebook Dozer Price Prediction challenge, one-size-fits-all models often fall short. Different equipment models (ModelIDs) can have vastly different characteristics—age, usage patterns, depreciation curves, and market dynamics. Trying to capture all of that in a single global model can dilute performance.

What is a Partitioned Model?

A partitioned model means training separate models for each subgroup or partition in the data—in this case, for each unique ModelID. Instead of fitting one global model to the entire dataset, you're allowing the model to specialize based on contextual differences.

In Xplainable, this can be achieved seamlessly by training per-group models through the auto-training UI or the client.

from xplainable.core.models import PartitionedRegressor, XRegressor

# Container that holds one regressor per ModelID value
partitioned_model = PartitionedRegressor(partition_on='ModelID')

# Fit one model for each of the selected ModelIDs
for partition in models_to_train:
    # Use the training split only: `data` still contains the 2012 sales that
    # make up the validation set, so fitting on it would leak validation rows
    # into the partition models. data_train already has drop_cols removed.
    part = data_train[data_train['ModelID'] == partition]

    # IMPORTANT: Exclude the partition column (ModelID) from features
    partition_features = [col for col in part.columns if col not in ('SalePrice', 'ModelID')]
    x_train_partition = part[partition_features]
    y_train_partition = part['SalePrice']

    # Fit the embedded model
    model_partition = XRegressor()
    model_partition.fit(x_train_partition, y_train_partition, id_columns=id_columns)

    # Add the model to the partitioned model
    partitioned_model.add_partition(model_partition, partition)

# IMPORTANT: Add the __dataset__ partition (full dataset model)
# Also exclude ModelID from the full dataset features
full_dataset_features = [col for col in X_train.columns if col != 'ModelID']
X_train_no_modelid = X_train[full_dataset_features]

full_model = XRegressor()
full_model.fit(X_train_no_modelid, y_train, id_columns=id_columns)
partitioned_model.add_partition(full_model, '__dataset__')

# Now you can predict on the partitioned model
# The prediction will use ModelID for routing, but not as a feature
y_pred = partitioned_model.predict(X_valid)
1plot_error(partitioned_model, X_train, y_train, color_column="ModelID")

1plot_error(partitioned_model, X_valid, y_valid, color_column="ModelID")

Evaluation of Model Predictions Against Validation Data

The scatter plot illustrates our model's performance on the validation set, comparing the true values against the predicted values for various bulldozer models. While the predictions are generally aligned with the true values, there is an observable underprediction across the data points, and the mean absolute error (MAE) is higher on the validation set (3599) than on the training set (3212).

Considerations for Model Refinement:

  • The impact of the mining boom in Australia in 2012, referenced from the Reserve Bank of Australia's report, suggests an economic context that may influence equipment prices. Incorporating macroeconomic indicators could potentially enhance the model's predictive accuracy.

  • Introducing time series features that capture year-over-year changes could offer a more nuanced understanding of price fluctuations over time, rather than relying solely on 'Age at Sale', which may not fully encapsulate such trends.

These considerations point towards the inclusion of external economic factors and more sophisticated time-based features to improve the model's prediction capabilities. Further analysis and iterative model tuning will be required to reduce the prediction error and align the model outputs more closely with the validation data.

Further Investigation:

  • An analysis of the trend line derived from time series splits (Age at Sale) could reveal insights into future forecasting capabilities. By extending this trend line, we can project forward forecasts that anticipate equipment prices. This approach could be particularly beneficial for capturing the trajectory of market shifts influenced by macroeconomic trends, such as the mining boom.

Should anyone be interested in contributing to the development of this predictive feature or investigating this further, please feel free to add to the issues on our repository or contact us directly at [email protected].

Access model partitions and plot explanations

1partitioned_model.partitions
Out:

{'4605': <xplainable.core.ml.regression.XRegressor at 0x2b9265a50>,

'3538': <xplainable.core.ml.regression.XRegressor at 0x2acfb4220>,

'4604': <xplainable.core.ml.regression.XRegressor at 0x2acfb5ed0>,

'3170': <xplainable.core.ml.regression.XRegressor at 0x2acfce980>,

'3362': <xplainable.core.ml.regression.XRegressor at 0x2b89a76d0>,

'3537': <xplainable.core.ml.regression.XRegressor at 0x2b8940160>,

'4603': <xplainable.core.ml.regression.XRegressor at 0x2b89b7df0>,

'3171': <xplainable.core.ml.regression.XRegressor at 0x2b8955ff0>,

'3357': <xplainable.core.ml.regression.XRegressor at 0x2b8943040>,

'3178': <xplainable.core.ml.regression.XRegressor at 0x2b9264490>,

'__dataset__': <xplainable.core.ml.regression.XRegressor at 0x2ad193850>}

1partitioned_model.partitions['3170'].explain()
1# Create a model
2try:
3 model_id, version_id = client.models.create_model(
4 model=partitioned_model,
5 model_name="Dozer Partitioned Model",
6 model_description="Predicting the price of a different dozer types partitioned on ModelID",
7 x=X_train,
8 y=y_train
9 )
10except XplainableAPIError as e:
11 print(f"Error creating partitioned model: {e}")

Step 3: Deploy the Model

Deploy the model to make it available for inference. You’ll use the version ID returned from the model creation step to deploy this specific version.

# Deploy the partitioned model version and keep the id of the new deployment
try:
    deployment_response = client.deployments.deploy(
        model_version_id=version_id  # <- Use version id produced above
    )
except XplainableAPIError as e:
    print(f"Error deploying partitioned model: {e}")
else:
    deployment_id = deployment_response.deployment_id

Step 4: Activate the Deployment

Activate the model deployment so that it’s ready to receive inference requests.

1try:
2 client.deployments.activate_deployment(deployment_id=deployment_id)
3except XplainableAPIError as e:
4 print(f"Error activating partitioned deployment: {e}")

Step 5: Generate a Deploy Key

Generate an API deploy key for secure access to the deployed model. This key will be used to authenticate when making prediction requests.

# Generate a deploy key for the partitioned deployment.
# Uses the same keyword arguments (description, days_until_expiry) as the
# deploy-key call for the single-model deployment earlier in this notebook,
# which passed `description=` where this cell previously passed `name=`.
try:
    deploy_key = client.deployments.generate_deploy_key(
        deployment_id=deployment_id,
        description='API key for Dozer Price Prediction',
        days_until_expiry=30
    )
    print(f"Deploy key created: {str(deploy_key)}")
except XplainableAPIError as e:
    print(f"Error generating deploy key: {e}")

Step 6: Format a Sample Input

Prepare a single test sample (excluding the target column SalePrice) to be used for model inference. This sample is converted to JSON format for use in an API call.

# Draw the sample from the same feature layout the models were trained on:
# `data` carries the engineered AgeAtSale column, and drop_cols removes the
# columns that were excluded from training. The raw `df` has neither change.
inference_features = data.drop(columns=drop_cols + ["SalePrice"])
body = json.loads(
    inference_features[inference_features.ModelID == "4605"].sample(1).to_json(orient="records")
)
4
1body
Out:

[{'SalesID': 2644612,

'MachineID': 1877801,

'ModelID': '4605',

'datasource': 149,

'auctioneerID': 1.0,

'YearMade': 2001,

'MachineHoursCurrentMeter': 3834.0,

'UsageBand': 'Medium',

'fiModelDesc': '310G',

'fiBaseModel': '310',

'fiSecondaryDesc': 'G',

'fiModelSeries': None,

'fiModelDescriptor': None,

'ProductSize': None,

'fiProductClassDesc': 'Backhoe Loader - 14.0 to 15.0 Ft Standard Digging Depth',

'state': 'Nevada',

'ProductGroup': 'BL',

'ProductGroupDesc': 'Backhoe Loaders',

'DriveSystem': 'Two Wheel Drive',

'Enclosure': 'EROPS w AC',

'Forks': 'None or Unspecified',

'PadType': 'None or Unspecified',

'RideControl': 'No',

'Stick': 'Standard',

'Transmission': 'Standard',

'Turbocharged': 'None or Unspecified',

'BladeExtension': None,

'BladeWidth': None,

'EnclosureType': None,

'EngineHorsepower': None,

'Hydraulics': None,

'Pushblock': None,

'Ripper': None,

'Scarifier': None,

'TipControl': None,

'TireSize': None,

'Coupler': None,

'CouplerSystem': None,

'GrouserTracks': None,

'HydraulicsFlow': None,

'TrackType': None,

'UndercarriagePadWidth': None,

'StickLength': None,

'Thumb': None,

'PatternChanger': None,

'GrouserType': None,

'BackhoeMounting': None,

'BladeType': None,

'TravelControls': None,

'DifferentialType': None,

'SteeringControls': None,

'MfgYear': 2001.0,

'fiManufacturerID': 43,

'fiManufacturerDesc': 'John Deere',

'PrimarySizeBasis': 'Standard Digging Depth - Ft',

'PrimaryLower': 14.0,

'PrimaryUpper': 15.0,

'saleyear': 2011,

'salemonth': 6,

'saledayofweek': 'Friday'}]

Step 7: Send a Prediction Request

Make a POST request to the Xplainable inference endpoint with the sample input. The deploy_key is included in the headers for authentication, and the model returns a prediction based on the JSON-formatted input data.

# Send the sample to the inference endpoint, authenticating with the deploy key
response = requests.post(
    url="https://inference.xplainable.io/v1/predict",
    headers={'api_key': str(deploy_key)},
    json=body,
    timeout=30,  # requests waits indefinitely by default; bound the wait
)
# Surface HTTP errors (expired key, inactive deployment) instead of parsing an error page
response.raise_for_status()

value = response.json()
print("Prediction result:", value)