Skip to main content
Version: Next

Classification - Customer Repurchase Window Prediction

Predicting customer repurchase behavior and timing using historical transaction data from an online retail business.

Dataset Source: Online Retail II UCI Dataset
Problem Type: Classification
Target Variable: Customer repurchase probability within specific time windows
Use Case: Customer retention strategies, inventory management, targeted marketing campaigns

Package Imports

import pandas as pd
import xplainable as xp
from xplainable.core.models import XClassifier
from xplainable.core.optimisation.bayesian import XParamOptimiser
from xplainable.preprocessing.pipeline import XPipeline
from xplainable.preprocessing import transformers as xtf
from sklearn.model_selection import train_test_split
import requests
import json

# Additional imports specific to this example
import numpy as np
import datetime as dt
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import xplainable_client
!pip install xplainable
!pip install xplainable-client

Xplainable Cloud Setup

# Initialize Xplainable Cloud client
# This client object is reused later in the walkthrough to persist the trained
# model, deploy it and generate a deploy key.
# NOTE(review): the key is left blank on purpose; avoid committing a real key
# into a shared notebook.
client = xplainable_client.Client(
api_key="", # Add your API key from https://platform.xplainable.io/
)

Data Loading and Exploration

Load the Online Retail II dataset and perform basic data exploration.

import pandas as pd
import requests
from io import BytesIO

def load_online_retail_ii(timeout=60):
    """
    Download the Online Retail II workbook from the UCI repository and
    return both yearly sheets combined into a single cleaned DataFrame.

    Parameters
    ----------
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without it a stalled connection
        would block the notebook indefinitely. Defaults to 60 seconds.

    Returns
    -------
    pandas.DataFrame
        Rows from the "Year 2009-2010" and "Year 2010-2011" sheets (in that
        order, re-indexed from 0), restricted to rows with a Customer ID and
        strictly positive Price and Quantity, plus an ``Amount`` column
        equal to ``Price * Quantity``.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with an error status.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout``.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00502/online_retail_II.xlsx"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()  # fail early if we got a bad status

    # Read both year-sheets from the in-memory workbook and concatenate them.
    # The context manager releases the workbook handle once both are parsed.
    with pd.ExcelFile(BytesIO(r.content)) as xls:
        df1 = pd.read_excel(xls, sheet_name="Year 2009-2010", parse_dates=["InvoiceDate"])
        df2 = pd.read_excel(xls, sheet_name="Year 2010-2011", parse_dates=["InvoiceDate"])
    df = pd.concat([df1, df2], ignore_index=True)

    # Basic cleanup: drop rows without a customer, keep only rows with a
    # positive price and quantity, then derive the line amount.
    df = df.dropna(subset=["Customer ID"])
    df = df[(df.Price > 0) & (df.Quantity > 0)].copy()
    df["Amount"] = df.Price * df.Quantity
    return df

# usage
df = load_online_retail_ii()
df.head()
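|   | Invoice | StockCode | Description | Quantity | InvoiceDate | Price | Customer ID | Country | Amount |
|---|---------|-----------|-------------|----------|-------------|-------|-------------|---------|--------|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085 | United Kingdom | 83.4 |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom | 81 |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom | 81 |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.1 | 13085 | United Kingdom | 100.8 |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085 | United Kingdom | 30 |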

The timeline below illustrates the core problem the model is solving: will a customer place another order within 30 days of a given purchase? Each row represents an individual customer (C1 – C4), and every blue dot marks one of their historical purchases. From each purchase, a magenta line extends 30 days—the evaluation window used to create the training label. When a follow-up order actually arrives inside that window, it is highlighted with a pink star. Purchases followed by a star are the positive cases (“repurchased”), while those without a star are negative. Visually stepping through these tracks makes it clear how the dataset converts raw transactions into a binary outcome that the model can learn to predict.

# --- 1. LOAD & CLEAN -------------------------------------------------
# Re-parse the invoice timestamps; values that cannot be parsed become NaT.
parsed_dates = pd.to_datetime(df["InvoiceDate"], dayfirst=True, errors="coerce")
df["InvoiceDate"] = parsed_dates

# Keep rows that have both a customer and a usable timestamp ...
has_customer_and_date = df[["Customer ID", "InvoiceDate"]].notna().all(axis=1)
df = df[has_customer_and_date]

# ... and a strictly positive price and quantity.
is_positive_sale = df["Price"].gt(0) & df["Quantity"].gt(0)
df = df.loc[is_positive_sale].copy()

# Line value and the calendar month the invoice falls in.
df["Amount"] = df["Price"].mul(df["Quantity"])
df["InvoiceMonth"] = df["InvoiceDate"].dt.to_period("M")

1. Data Preprocessing

Data Preview and Initial Exploration

df.head()
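|   | Invoice | StockCode | Description | Quantity | InvoiceDate | Price | Customer ID | Country | Amount | InvoiceMonth |
|---|---------|-----------|-------------|----------|-------------|-------|-------------|---------|--------|--------------|
| 0 | 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085 | United Kingdom | 83.4 | 2009-12 |
| 1 | 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom | 81 | 2009-12 |
| 2 | 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom | 81 | 2009-12 |
| 3 | 489434 | 22041 | RECORD FRAME 7" SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.1 | 13085 | United Kingdom | 100.8 | 2009-12 |
| 4 | 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085 | United Kingdom | 30 | 2009-12 |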

RFM Feature Engineering

# Order the line items by customer, then by invoice timestamp
df_sorted = df.sort_values(by=["Customer ID", "InvoiceDate"])

# For each row, the InvoiceDate of the same customer's preceding row
# (missing on each customer's first row)
previous_row_date = df_sorted.groupby("Customer ID")["InvoiceDate"].shift()
df_sorted["LastPurchase"] = previous_row_date

# (Re)derive InvoiceMonth and MonthEnd on the sorted frame
df_sorted["InvoiceMonth"] = df_sorted["InvoiceDate"].dt.to_period("M")
df_sorted["MonthEnd"] = df_sorted["InvoiceMonth"].dt.to_timestamp("M")

# Latest LastPurchase value per customer and month, ignoring rows without one
rows_with_history = df_sorted.dropna(subset=["LastPurchase"])
last_purchase = (
    rows_with_history
    .groupby(["Customer ID", "InvoiceMonth"])["LastPurchase"]
    .max()
    .reset_index()
)

# Monthly feature matrix (grp): one row per customer and month
monthly = df.groupby(["Customer ID", "InvoiceMonth"])
grp = monthly.agg(
    Frequency=("Invoice", "nunique"),    # number of distinct invoices
    DistinctItems=("Quantity", "sum"),   # note: this is the summed quantity
    Monetary=("Amount", "sum"),          # total spend
    Country=("Country", "first"),        # carry the country through
).reset_index()

# Make sure InvoiceMonth has the monthly period dtype
grp["InvoiceMonth"] = grp["InvoiceMonth"].astype("period[M]")

# Attach the last-purchase dates, then Recency = whole days between
# MonthEnd and LastPurchase
grp = grp.merge(last_purchase, on=["Customer ID", "InvoiceMonth"], how="left")
grp["MonthEnd"] = grp["InvoiceMonth"].dt.to_timestamp("M")
grp["Recency"] = (grp["MonthEnd"] - grp["LastPurchase"]).dt.days
grp = grp.drop(columns=["LastPurchase"])

# Calendar month and quarter for time-based grouping or encoding
grp["Month"] = grp["InvoiceMonth"].dt.month
grp["Quarter"] = grp["InvoiceMonth"].dt.quarter

Build 30-day Repurchase Label

from pandas.tseries.offsets import Day

# Length of the repurchase window in days (e.g. 30, 60 or 90)
DAYS = 30

# Step 1: Unique (Customer ID, InvoiceDate) combinations
invoice_dates = df[["Customer ID", "InvoiceDate"]].drop_duplicates().copy()


# Step 2: Per-row definition of the label. Kept for single-row checks; the
# bulk labelling in Step 3 computes the same flag without a row-wise scan.
def has_purchase_within_n_days(row):
    """Return 1 if the row's customer has a later purchase within DAYS days.

    Reads the module-level ``invoice_dates`` frame and ``DAYS`` constant.

    Parameters
    ----------
    row : mapping with "Customer ID" and "InvoiceDate" entries.

    Returns
    -------
    int
        1 when ``invoice_dates`` holds a row for the same customer whose
        InvoiceDate is strictly after ``row["InvoiceDate"]`` and no later
        than ``row["InvoiceDate"] + DAYS`` days; otherwise 0.
    """
    cid, date = row["Customer ID"], row["InvoiceDate"]
    same_customer = invoice_dates["Customer ID"] == cid
    in_window = (invoice_dates["InvoiceDate"] > date) & (
        invoice_dates["InvoiceDate"] <= date + Day(DAYS)
    )
    return int((same_customer & in_window).any())


# Step 3: Flag every purchase date that is followed by another one in the
# window. Because (Customer ID, InvoiceDate) pairs are unique, the customer's
# next date after sorting is strictly later, so a purchase exists inside the
# window exactly when that next date is <= date + DAYS. This avoids scanning
# the whole frame once per row. Customers' last dates have no next date (NaT),
# which compares False and therefore yields 0.
by_customer_time = invoice_dates.sort_values(["Customer ID", "InvoiceDate"])
next_purchase = by_customer_time.groupby("Customer ID")["InvoiceDate"].shift(-1)
window_end = by_customer_time["InvoiceDate"] + Day(DAYS)
# Assignment aligns on the index, so invoice_dates keeps its original row order.
invoice_dates[f"rebuy_{DAYS}d"] = (next_purchase <= window_end).astype(int)

# Step 4: Convert to monthly and aggregate to get the label
# NOTE(review): taking the monthly max means a month in which the customer has
# two purchase dates no more than DAYS days apart is labelled 1 by that
# in-month repeat itself, and Frequency (invoices in the month) is a model
# feature. Confirm this overlap between label and features is intended.
invoice_dates["InvoiceMonth"] = invoice_dates["InvoiceDate"].dt.to_period("M")

label = (
    invoice_dates.groupby(["Customer ID", "InvoiceMonth"])
    [f"rebuy_{DAYS}d"].max()
    .reset_index()
    .rename(columns={f"rebuy_{DAYS}d": f"will_rebuy_{DAYS}d"})
)

# Step 5: Merge with feature matrix
data = grp.merge(label, on=["Customer ID", "InvoiceMonth"], how="left")
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on a temporary object and does
# not update `data` when pandas Copy-on-Write is active.
data[f"will_rebuy_{DAYS}d"] = data[f"will_rebuy_{DAYS}d"].fillna(0).astype(int)

data[f"will_rebuy_{DAYS}d"].value_counts()
Out:

0 16500

1 9095

Name: will_rebuy_30d, dtype: int64

Train/Test Time-based Split

# --- 4. TIME-BASED SPLIT & MODEL (DYNAMIC DAYS, NO ONE-HOT) --------

# Timestamp form of InvoiceMonth, used as the split key
data["Date"] = data["InvoiceMonth"].dt.to_timestamp()

# Name of the target column produced by the labelling step
label_col = f"will_rebuy_{DAYS}d"

# Months before the cutoff train the model; the cutoff month onwards is held out
cutoff = "2011-07-01"
train = data[data["Date"] < cutoff]
test = data[data["Date"] >= cutoff]

# Columns that are identifiers, split helpers or the target - not features
non_feature_cols = [label_col, "InvoiceMonth", "Date", "MonthEnd", "Customer ID"]

X_train, y_train = train.drop(columns=non_feature_cols), train[label_col]
X_test, y_test = test.drop(columns=non_feature_cols), test[label_col]
X_train
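|       | Frequency | DistinctItems | Monetary | Country | Recency | Month | Quarter |
|-------|-----------|---------------|----------|---------|---------|-------|---------|
| 0     | 5 | 26 | 113.50 | United Kingdom | 12.0 | 12 | 4 |
| 1     | 4 | 20 | 90.00 | United Kingdom | 16.0 | 1 | 1 |
| 2     | 1 | 5 | 27.05 | United Kingdom | 28.0 | 3 | 1 |
| 3     | 1 | 19 | 142.31 | United Kingdom | 1.0 | 6 | 2 |
| 4     | 1 | 74215 | 77183.60 | United Kingdom | 216.0 | 1 | 1 |
| ...   | ... | ... | ... | ... | ... | ... | ... |
| 25589 | 1 | 494 | 833.48 | United Kingdom | 10.0 | 8 | 3 |
| 25590 | 1 | 732 | 1071.61 | United Kingdom | 13.0 | 5 | 2 |
| 25591 | 2 | 508 | 892.60 | United Kingdom | 8.0 | 9 | 3 |
| 25592 | 1 | 187 | 381.50 | United Kingdom | 7.0 | 11 | 4 |
| 25593 | 1 | 488 | 765.28 | United Kingdom | 8.0 | 5 | 2 |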

2. Model Optimization

# Search for model parameters using the training split only, so the held-out
# months play no part in tuning. The result is unpacked straight into
# XClassifier(**params) in the training step.
opt = XParamOptimiser()
params = opt.optimise(X_train, y_train)
Out:

100%|██████████| 30/30 [00:08<00:00, 3.60trial/s, best loss: -0.8776764727397712]

3. Model Training

# Build the classifier from the optimised parameters and fit it on the
# training split (months before 2011-07-01).
model = XClassifier(**params)
model.fit(X_train, y_train)
Out:

<xplainable.core.ml.classification.XClassifier at 0x2a566e140>

4. Model Interpretability and Explainability

# Show the fitted model's explanation output.
# NOTE(review): presumably renders an interactive view in a notebook - confirm
# against the xplainable documentation.
model.explain()

7. Model Testing

Hold-out Evaluation

# Score the model on the held-out months (2011-07-01 onwards). The call
# returns a dict of metrics (confusion matrix, classification report,
# ROC AUC, log loss, ...), shown below.
model.evaluate(X_test, y_test)
Out:

{'confusion_matrix': [[4333, 25], [823, 1612]],

'classification_report': {'0': {'precision': 0.8403801396431342,

'recall': 0.9942634235888022,

'f1-score': 0.9108681942400673,

'support': 4358.0},

'1': {'precision': 0.984728161270617,

'recall': 0.6620123203285421,

'f1-score': 0.7917485265225933,

'support': 2435.0},

'accuracy': 0.8751656116590608,

'macro avg': {'precision': 0.9125541504568756,

'recall': 0.8281378719586721,

'f1-score': 0.8513083603813303,

'support': 6793.0},

'weighted avg': {'precision': 0.892122732409647,

'recall': 0.8751656116590608,

'f1-score': 0.868168887469561,

'support': 6793.0}},

'roc_auc': 0.8699849600395034,

'neg_brier_loss': 0.8800487167761432,

'log_loss': 0.4091278987464692,

'cohen_kappa': 0.7074258976095472}

5. Model Persistence

# Persist the fitted model through the Xplainable Cloud client, passing the
# training features and target alongside it. The returned value is indexed
# with "version_id" in the deployment step.
model_id = client.create_model(
model=model,
model_name = "Customer Repurchase - 30 Day Forecast",
model_description = "Predicts whether a customer will make another purchase within 30 days based on their recent order behaviour and RFM features.",
x=X_train,
y=y_train
)

6. Model Deployment

# Create a deployment for the model version saved in the persistence step.
deployment = client.deploy(
model_version_id=model_id["version_id"] #<- Use version id produced above
)
# Activate the deployment, then generate a key for it; the key is later sent
# as the 'api_key' header of the inference request.
client.activate_deployment(deployment['deployment_id'])
# NOTE(review): the trailing 7 is presumably the key's validity in days -
# confirm against the xplainable-client documentation.
deploy_key = client.generate_deploy_key(deployment['deployment_id'], 'Deployment API for Purchase Prediction', 7)

Generate Example Payload

# Two ways of producing a request body:
#   option == 1  -> ask the client for an example payload for the deployment
#   anything else -> serialise one randomly sampled training row
option = 2

if option != 1:
    # Same feature columns the model was trained on
    feature_frame = train.drop(columns=[label_col, "InvoiceMonth", "Date", "MonthEnd", "Customer ID"])
    sampled_row = feature_frame.sample(1)
    body = json.loads(sampled_row.to_json(orient="records"))
else:
    body = client.generate_example_deployment_payload(deployment['deployment_id'])
body

Call Inference Endpoint

# Send the example payload to the hosted inference endpoint, authenticating
# with the deploy key generated for the deployment.
response = requests.post(
    url="https://inference.xplainable.io/v1/predict",
    headers={'api_key': deploy_key['deploy_key']},
    json=body,
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the notebook indefinitely.
    timeout=30,
)

# The body is decoded without checking the status code first, so an error
# payload returned by the service is still displayed.
value = response.json()
value