Lab: build a complete ML pipeline

Ingest, clean, engineer features, split, scale, train, evaluate, and persist — each step as a tested, reproducible function.

A notebook is not a pipeline. A pipeline is a sequence of composable, tested, deterministic functions that can be run again from scratch and produce the same output. This lab builds one, step by step.

Stage 1 — ingest and inspect

Python — editable, runs in your browser

Stage 2 — clean

Python — editable, runs in your browser

import numpy as np
import pandas as pd

def make_dataset(seed=42):
  """Generate a reproducible 500-row synthetic classification dataset.

  Columns of the returned DataFrame:
    age    -- integer-valued floats drawn uniformly from [18, 70)
    income -- normal draws (mean 55000, sd 18000); 20 distinct rows are set
              to NaN and five positions (drawn with replacement, so they may
              repeat or land on a NaN row) are overwritten with values in
              [200000, 500000)
    region -- one of "north", "south", "east", "west"
    label  -- 0/1 target: noisy threshold on (age < 40) + (income > 55000)

  The same ``seed`` always yields the same frame.
  """
  n_rows = 500
  gen = np.random.default_rng(seed)

  # Every draw advances the generator, so the order of calls below is part
  # of the function's output contract.
  ages = gen.integers(18, 70, n_rows).astype(float)
  incomes = gen.normal(55000, 18000, n_rows)
  regions = gen.choice(["north", "south", "east", "west"], n_rows)

  nan_rows = gen.choice(n_rows, 20, replace=False)
  incomes[nan_rows] = np.nan

  # Outlier values are drawn before their positions.
  outlier_values = gen.uniform(200000, 500000, 5)
  outlier_rows = gen.integers(0, n_rows, 5)
  incomes[outlier_rows] = outlier_values

  # NaN incomes compare False against the threshold and contribute 0.
  is_young = (ages < 40).astype(float)
  is_high_income = (incomes > 55000).astype(float)
  noise = gen.normal(0, 0.6, n_rows)
  labels = (is_young + is_high_income + noise > 1).astype(int)

  return pd.DataFrame(
    {"age": ages, "income": incomes, "region": regions, "label": labels}
  )

def clean(df: pd.DataFrame, income_cap: float = 150000) -> pd.DataFrame:
  """Return a cleaned copy of ``df``; the input frame is not modified.

  Missing ``income`` values are replaced by the column median (computed
  before capping), then every income above ``income_cap`` is lowered to
  ``income_cap``.
  """
  cleaned = df.copy()
  income = cleaned["income"]
  cleaned["income"] = income.fillna(income.median()).clip(upper=income_cap)
  return cleaned

# Test the cleaner
def test_clean_no_nulls():
  """Check that cleaning the seed-42 dataset leaves no missing value in any column."""
  cleaned = clean(make_dataset(42))
  remaining_nulls = cleaned.isnull().sum().sum()
  assert remaining_nulls == 0, "Nulls remain after cleaning"
  print("test_clean_no_nulls: PASSED")

def test_clean_no_mutation():
  """Check that ``clean`` leaves its input DataFrame completely unchanged.

  The input is compared against a full snapshot taken before the call.
  Comparing only the null count would miss in-place changes that keep the
  number of NaNs the same, such as clipping the outliers in place.
  """
  df = make_dataset(42)
  snapshot = df.copy()
  _ = clean(df)
  # DataFrame.equals treats NaNs in matching positions as equal, so the
  # deliberately missing incomes do not cause a false failure.
  assert df.equals(snapshot), "Input mutated"
  print("test_clean_no_mutation: PASSED")

# Run both checks; each one prints its own PASSED line on success.
test_clean_no_nulls()
test_clean_no_mutation()

# Summarise the cleaned seed-42 dataset.
df_clean = clean(make_dataset(42))
n_missing = df_clean.isnull().sum().sum()
max_income = df_clean['income'].max()
print(f"\nAfter cleaning — missing values: {n_missing}")
print(f"Income max after cap: {max_income:.0f}")

Stage 3 — feature engineering and split

Python — editable, runs in your browser

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def clean(df, income_cap=150000):
  """Return a copy of ``df`` with income imputed by its median, then capped at ``income_cap``."""
  out = df.copy()
  income = out["income"]
  fill_value = income.median()
  imputed = income.fillna(fill_value)
  out["income"] = imputed.clip(upper=income_cap)
  return out

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of ``df`` with an ``age_income`` interaction column added.

  ``age_income`` is ``age * income / 1e6``. All existing columns, including
  ``label``, are kept; separating the label from the features is left to the
  caller. The input frame is not modified.
  """
  result = df.copy()
  # Dividing by 1e6 keeps the product on a scale closer to the other columns.
  result["age_income"] = result["age"] * result["income"] / 1e6
  return result

# Run the stages in order: raw data -> cleaned -> engineered feature added.
raw = make_dataset(42)
clean_df = clean(raw)
feat_df = engineer_features(clean_df)

# Separate the target from the predictor columns.
y = feat_df["label"]
X = feat_df.drop(columns=["label"])

# Stratified 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  stratify=y,
  random_state=42,
)

print(f"Feature columns: {X.columns.tolist()}")
print(f"Train: {X_train.shape}  Test: {X_test.shape}")
print(f"Class balance (train): {np.bincount(y_train)}")

Stage 4 — build pipeline, tune, and evaluate

Python — editable, runs in your browser

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

def engineer_features(df):
  """Return a copy of ``df`` with ``age_income`` (age * income / 1e6) appended."""
  interaction = df["age"] * df["income"] / 1e6
  enriched = df.copy()
  enriched["age_income"] = interaction
  return enriched

# NOTE: make_dataset() and clean() are used here but not defined in this
# snippet — presumably they come from an earlier cell; confirm the runtime
# shares state between cells.
raw = make_dataset(42)
# NOTE(review): clean() takes its imputation median from the whole dataset,
# so rows that end up in the test split influence the fill value. Moving the
# imputation into the Pipeline below would keep the test set fully held out.
feat_df = engineer_features(clean(raw))
X = feat_df.drop(columns=["label"])
y = feat_df["label"]
# Stratified 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, stratify=y, random_state=42
)

numeric_cols     = ["age", "income", "age_income"]
categorical_cols = ["region"]

# Scale the numeric columns and one-hot encode region. Both transformers are
# fitted inside the pipeline, i.e. only on the rows passed to fit().
preprocessor = ColumnTransformer([
  ("num", StandardScaler(),              numeric_cols),
  ("cat", OneHotEncoder(drop="first"),   categorical_cols),
])

pipe = Pipeline([
  ("prep", preprocessor),
  ("clf",  DecisionTreeClassifier(random_state=42)),
])

# 5-fold grid search over tree depth and leaf size, selecting on F1.
grid = GridSearchCV(
  pipe,
  {"clf__max_depth": [3, 5, 7, 10], "clf__min_samples_leaf": [5, 10, 20]},
  cv=5, scoring="f1", n_jobs=-1,
)
grid.fit(X_train, y_train)

best = grid.best_estimator_
preds = best.predict(X_test)
# Pin the label order so the matrix is always 2x2 and the TN/FP/FN/TP
# indexing below stays valid even if one class is absent from y_test/preds.
cm = confusion_matrix(y_test, preds, labels=[0, 1])

print("=== Pipeline evaluation report ===")
print(f"Best params:   {grid.best_params_}")
print(f"Best CV F1:    {grid.best_score_:.4f}")
print(f"Test accuracy: {accuracy_score(y_test, preds):.4f}")
print(f"Test F1:       {f1_score(y_test, preds):.4f}")
print("Confusion matrix:")
print(f"  TN={cm[0,0]}  FP={cm[0,1]}")
print(f"  FN={cm[1,0]}  TP={cm[1,1]}")

Stage 5 — persist the pipeline

Once you have the fitted pipeline, serialise it for later use. In a real project, this step writes to a versioned artefact store. Here it writes to a temporary file and verifies the predictions are identical after loading:

import joblib
import numpy as np

# Save the fitted estimator (preprocessing + tree) as a single artefact.
joblib.dump(best, "/tmp/churn_pipeline_v1.pkl")

# Load and verify
# NOTE: joblib.load unpickles arbitrary Python objects — only load artefacts
# that come from a trusted source.
loaded = joblib.load("/tmp/churn_pipeline_v1.pkl")
# Predictions are discrete class labels, so require exact equality rather
# than the tolerance-based comparison np.allclose performs.
assert np.array_equal(best.predict(X_test), loaded.predict(X_test))
print("Serialisation verified.")

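The fitted sklearn pipeline — ColumnTransformer, scalers, encoders, and fitted tree — is bundled in a single object. Note that `clean` and `engineer_features` are plain functions applied before the split, so they are not part of the saved object: new data must pass through them before calling `predict`. Loading it on a different machine (with compatible sklearn/numpy versions) produces exactly the same predictions, which is the minimal requirement for a reproducible deployment.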

Where to go next

You have completed the Advanced Data Science track. The skills built across these five modules — model selection, sklearn pipelines, rigorous evaluation, time series analysis, and pipeline engineering — form the core toolkit for taking analysis from notebook to production.

Finished reading? Mark it complete to track your progress.

Lab: build a complete ML pipeline

Stage 1 — ingest and inspect

Stage 2 — clean

Stage 3 — feature engineering and split

Stage 4 — build pipeline, tune, and evaluate

Stage 5 — persist the pipeline

Where to go next

On this page