CODE: OLS and ML Modeling with SHAP

Hot City, Heated Calls:
Understanding Extreme Heat and Quality of Life
Using New York City's 311 and SHAP

OLS regression, Random Forest modeling, and SHAP interpretability analysis.


Load the Data

Code Cell 1
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

import matplotlib.pyplot as plt
import numpy as np
import os
Code Cell 2
df = pd.read_csv("data/model/Final_Data_Model.csv")
Code Cell 3
df.shape
(2225, 20)
Code Cell 4
# Socioeconomic and demographic predictors.
acs_predictors = [
    "PCT_BACHELORS_PLUS",
    "PCT_RENTERS",
    "PCT_LIMITED_ENGLISH",
    "MEDIAN_INCOME",
    "POVERTY_RATE",
    "PCT_NON_WHITE"
]

# Environmental predictors.
env_predictors = [
    "PCT_TREE_CANOPY",
    "PCT_IMPERVIOUS",
    "WCR",
    "NDVI"
]

# Urban form predictors.
# Park access was dropped because it was highly correlated with subway distance
# (see the correlation check sketched after this cell).
urban_predictors = [
    "BD",                    # Building density.
    "AH",                    # Weighted average building height.
    "POI_500M_DENSITY",      # Point-of-interest density within 500 m.
    "KNN_SUBWAY_dist_mean"   # Mean distance to the nearest subway stations.
]

# Combined predictor set with all features.
all_predictors = env_predictors + acs_predictors + urban_predictors

# Two targets: 311 calls per 1k residents during heat weeks and normal weeks.
targets = ["heatweek_calls_per_1k", "normalweek_calls_per_1k"]
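The park-access variable was dropped because of its strong correlation with subway distance. A quick way to confirm that the remaining predictors are not badly collinear is to inspect their pairwise correlations. A minimal sketch using the df and all_predictors defined above (the 0.8 cutoff is an arbitrary choice, and this runs on the data before the cleaning step below):

# Pairwise Pearson correlations among the predictors; flag any pair above |0.8|.
corr = df[all_predictors].corr()

high_pairs = (
    corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # upper triangle only
        .stack()
        .loc[lambda s: s.abs() > 0.8]
)
print(high_pairs.sort_values(key=lambda s: s.abs(), ascending=False))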

Data Cleaning

Population Distribution

Code Cell 5
df["TOTAL_POP_x"].hist(bins=40, figsize=(6,4))
plt.xlabel("KTOTAL_POP")
plt.ylabel("Count")
plt.show()
Figure 1

Target Distributions

Code Cell 6
## Histogram
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Plot histograms
plt.figure(figsize=(12, 5))

# Histogram for heatweek_calls_per_1k
plt.subplot(1, 2, 1)
plt.hist(df["heatweek_calls_per_1k"], bins=200)
plt.title("Histogram of Heat Week Calls per 1k")
plt.xlabel("Calls per 1k Population")
plt.ylabel("Frequency")

# Histogram for normalweek_calls_per_1k
plt.subplot(1, 2, 2)
plt.hist(df["normalweek_calls_per_1k"], bins=200)
plt.title("Histogram of Normal Week Calls per 1k")
plt.xlabel("Calls per 1k Population")
plt.ylabel("Frequency")

plt.tight_layout()
plt.show()
Figure 1

Log Transform

Code Cell 7
## Log Transformation for Targets

# Plot histograms
plt.figure(figsize=(12, 5))

# Histogram for log-transformed heatweek_calls_per_1k
plt.subplot(1, 2, 1)
plt.hist(np.log1p(df["heatweek_calls_per_1k"]), bins=100)
plt.title("Histogram of Logged Heat Week Calls per 1k")
plt.xlabel("Logged Calls per 1k Population")
plt.ylabel("Frequency")

# Histogram for log-transformed normalweek_calls_per_1k
plt.subplot(1, 2, 2)
plt.hist(np.log1p(df["normalweek_calls_per_1k"]), bins=100)
plt.title("Histogram of Logged Normal Week Calls per 1k")
plt.xlabel("Logged Calls per 1k Population")
plt.ylabel("Frequency")

plt.tight_layout()
plt.show()
Figure 1
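To quantify how much the transform reduces the right skew visible in the histograms above, the sample skewness of each target can be compared before and after log1p. A minimal sketch using pandas' built-in .skew():

# Sample skewness of each target before and after the log1p transform.
for col in targets:
    raw_skew = df[col].skew()
    log_skew = np.log1p(df[col]).skew()
    print(f"{col}: skewness {raw_skew:.2f} -> {log_skew:.2f} after log1p")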

Clean the data: drop tracts with non-positive MEDIAN_INCOME or a total population of 500 or fewer.

Code Cell 8
df = df[df["MEDIAN_INCOME"] > 0]
df = df[df["TOTAL_POP_x"] > 500]
df.shape
(2192, 20)

OLS

Code Cell 9
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 2000)

Code Cell 10
import statsmodels.api as sm


X = df[all_predictors]
X = (X - X.mean()) / X.std() ## Standardize features
# y = df["heatweek_calls_per_1k"]  
y = np.log1p(df["heatweek_calls_per_1k"])

X_const = sm.add_constant(X)
ols_sm = sm.OLS(y, X_const).fit()

print(ols_sm.summary())  
                              OLS Regression Results                             
=================================================================================
Dep. Variable:     heatweek_calls_per_1k   R-squared:                       0.088
Model:                               OLS   Adj. R-squared:                  0.082
Method:                    Least Squares   F-statistic:                     14.94
Date:                   Wed, 03 Dec 2025   Prob (F-statistic):           4.79e-35
Time:                           22:02:54   Log-Likelihood:                -1453.7
No. Observations:                   2192   AIC:                             2937.
Df Residuals:                       2177   BIC:                             3023.
Df Model:                             14                                         
Covariance Type:               nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.8027      0.010     79.747      0.000       0.783       0.822
PCT_TREE_CANOPY          0.0240      0.027      0.885      0.376      -0.029       0.077
PCT_IMPERVIOUS           0.1241      0.033      3.740      0.000       0.059       0.189
WCR                      0.0244      0.012      1.957      0.051   -5.37e-05       0.049
NDVI                    -0.0801      0.016     -4.943      0.000      -0.112      -0.048
PCT_BACHELORS_PLUS      -0.0154      0.021     -0.720      0.471      -0.057       0.027
PCT_RENTERS             -0.0202      0.017     -1.172      0.242      -0.054       0.014
PCT_LIMITED_ENGLISH      0.0009      0.011      0.082      0.935      -0.021       0.023
MEDIAN_INCOME            0.0299      0.020      1.531      0.126      -0.008       0.068
POVERTY_RATE            -0.0699      0.016     -4.443      0.000      -0.101      -0.039
PCT_NON_WHITE           -0.0399      0.014     -2.922      0.004      -0.067      -0.013
BD                      -0.0950      0.018     -5.250      0.000      -0.130      -0.060
AH                      -0.0457      0.013     -3.426      0.001      -0.072      -0.020
POI_500M_DENSITY        -0.0175      0.015     -1.201      0.230      -0.046       0.011
KNN_SUBWAY_dist_mean    -0.0104      0.014     -0.748      0.454      -0.038       0.017
==============================================================================
Omnibus:                      690.899   Durbin-Watson:                   1.582
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2744.943
Skew:                           1.495   Prob(JB):                         0.00
Kurtosis:                       7.594   Cond. No.                         8.67
==============================================================================

Notes:
[1] Standard E
... [output truncated]
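The Omnibus and Jarque-Bera statistics indicate heavily skewed, heavy-tailed residuals, so the nonrobust standard errors above should be read with caution. One optional check (not part of the original analysis) is to refit the same specification with heteroskedasticity-robust HC3 standard errors via statsmodels:

# Same specification as above, but with HC3 (heteroskedasticity-robust) standard errors.
ols_robust = sm.OLS(y, X_const).fit(cov_type="HC3")
print(ols_robust.summary())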
Code Cell 11
X = df[all_predictors]
X = (X - X.mean()) / X.std() ## Standardize features
# y = df["normalweek_calls_per_1k"]  
y = np.log1p(df["normalweek_calls_per_1k"])

X_const = sm.add_constant(X)
ols_sm = sm.OLS(y, X_const).fit()

print(ols_sm.summary()) 
                               OLS Regression Results                              
===================================================================================
Dep. Variable:     normalweek_calls_per_1k   R-squared:                       0.084
Model:                                 OLS   Adj. R-squared:                  0.078
Method:                      Least Squares   F-statistic:                     14.22
Date:                     Wed, 03 Dec 2025   Prob (F-statistic):           3.78e-33
Time:                             22:03:01   Log-Likelihood:                -1379.4
No. Observations:                     2192   AIC:                             2789.
Df Residuals:                         2177   BIC:                             2874.
Df Model:                               14                                         
Covariance Type:                 nonrobust                                         
========================================================================================
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.8620      0.010     88.589      0.000       0.843       0.881
PCT_TREE_CANOPY          0.0532      0.026      2.033      0.042       0.002       0.105
PCT_IMPERVIOUS           0.1464      0.032      4.565      0.000       0.084       0.209
WCR                      0.0254      0.012      2.104      0.035       0.002       0.049
NDVI                    -0.0681      0.016     -4.347      0.000      -0.099      -0.037
PCT_BACHELORS_PLUS      -0.0105      0.021     -0.509      0.611      -0.051       0.030
PCT_RENTERS             -0.0087      0.017     -0.521      0.602      -0.041       0.024
PCT_LIMITED_ENGLISH     -0.0006      0.011     -0.056      0.956      -0.022       0.021
MEDIAN_INCOME            0.0309      0.019      1.638      0.102      -0.006       0.068
POVERTY_RATE            -0.0694      0.015     -4.563      0.000      -0.099      -0.040
PCT_NON_WHITE           -0.0305      0.013     -2.310      0.021      -0.056      -0.005
BD                      -0.0671      0.017     -3.835      0.000      -0.101      -0.033
AH                      -0.0529      0.013     -4.101      0.000      -0.078      -0.028
POI_500M_DENSITY        -0.0256      0.014     -1.819      0.069      -0.053       0.002
KNN_SUBWAY_dist_mean    -0.0225      0.013     -1.684      0.092      -0.049       0.004
==============================================================================
Omnibus:                      683.326   Durbin-Watson:                   1.591
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2766.578
Skew:                           1.470   Prob(JB):                         0.00
Kurtosis:                       7.652   Cond. No.                         8.67
==============================================================================

... [output truncated]

RF Model

Code Cell 12
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
from tqdm.auto import tqdm

# ! pip install shap

Heatweek Model

Code Cell 13
# 1. Choose which target to model
target = "heatweek_calls_per_1k"  # 

# Log-transform the target to reduce skewness
df[target] = np.log1p(df[target])

X = df[all_predictors]
y = df[target]

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# 3. Define RF pipeline (no scaler)
rf_pipeline = Pipeline([
    ("rf", RandomForestRegressor(random_state=42, n_jobs=-1))
])

# 4. Hyperparameter grid (small, reasonable ranges)
param_grid = {
    "rf__n_estimators": [200, 400, 600],
    "rf__max_depth": [10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": ["auto", "sqrt", 0.5],
}

# 5. GridSearchCV with 3-fold CV, optimizing RMSE
grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=3,
    # scoring="r2",  # use r2 to find the most fit model for explanation
    scoring="neg_root_mean_squared_error",  # sklearn uses negative for losses 
    n_jobs=-1,
    verbose=2
)

# 6. Fit on training data
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# 7. Evaluate on test data using best model
best_rf_pipeline = grid.best_estimator_

y_pred_test = best_rf_pipeline.predict(X_test)

r2_test = r2_score(y_test, y_pred_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
mae_test = mean_absolute_error(y_test, y_pred_test)

print("\n=== Test set performance (Random Forest) ===")
print(f"R² (test):   {r2_test:.4f}")
print(f"RMSE (test): {rmse_test:.4f}")
print(f"MAE (test):  {mae_test:.4f}")
Fitting 3 folds for each of 243 candidates, totalling 729 fits
c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning: 
243 fits failed out of a total of 729.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
193 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range 
... [output truncated]
Best parameters: {'rf__max_depth': 30, 'rf__max_features': 0.5, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 2, 'rf__n_estimators': 400}
Best CV RMSE: 0.4428078581968727

=== Test set performance (Random Forest) ===
R² (test):   0.2458
RMSE (test): 0.4149
MAE (test):  0.3129
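The FitFailedWarning above is caused by max_features="auto", which was deprecated in scikit-learn 1.1 and removed for regressors in 1.3; those candidates are scored as NaN and effectively dropped from the search. On a recent scikit-learn, a grid along these lines avoids the failures (a sketch; 1.0 reproduces the old "auto" behaviour of considering all features):

# Replacement grid for scikit-learn >= 1.3: "auto" is no longer accepted for max_features.
param_grid = {
    "rf__n_estimators": [200, 400, 600],
    "rf__max_depth": [10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": [1.0, "sqrt", 0.5],  # 1.0 = use all features (old "auto" for regressors)
}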
Code Cell 14
import shap
import matplotlib.pyplot as plt

# Initialize JS visualization (for notebooks)
shap.initjs()

# 1. Get the trained RandomForest model from the pipeline
rf_model = best_rf_pipeline.named_steps["rf"]

# 2. Convert full dataset into numpy array
X_full = X.values    # full data (train + test)

# 3. Build SHAP explainer
explainer = shap.TreeExplainer(rf_model)

# 4. Compute SHAP values for the full dataset (train + test)
shap_values = explainer.shap_values(X_full)  # shape: (n_samples, n_features)

# 5. SHAP summary plot (beeswarm)
plt.figure(figsize=(10, 6))
shap.summary_plot(
    shap_values,
    X,
    feature_names=all_predictors,
    show=True
)

# 6. SHAP bar plot (mean |SHAP| value)
plt.figure(figsize=(8, 6))
shap.summary_plot(
    shap_values,
    X,
    feature_names=all_predictors,
    plot_type="bar",
    show=True
)
📋 [Large table - see notebook]
Figure 1
Figure 2

Percentage Feature Importance

Code Cell 15
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# shap_values: shape = (n_samples, n_features)
# all_predictors: list of feature names

# 1. Compute mean absolute SHAP importance for each feature
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

# 2. Put into DataFrame for easy plotting
shap_df = pd.DataFrame({
    "feature": all_predictors,
    "importance": mean_abs_shap
})

# 3. Sort by importance descending
shap_df = shap_df.sort_values("importance", ascending=False)

# 4. Compute percentage contribution
shap_df["percentage"] = shap_df["importance"] / shap_df["importance"].sum() * 100

# 5. Plot bar chart with percentage label
plt.figure(figsize=(8, 10))
bars = plt.barh(shap_df["feature"], shap_df["importance"], color="steelblue")

plt.xlabel("Mean |SHAP value|", fontsize=12)
plt.title("SHAP Feature Importance (with percentage)", fontsize=14)

# 6. Add percentage labels to each bar
for i, (value, pct) in enumerate(zip(shap_df["importance"], shap_df["percentage"])):
    plt.text(
        value + shap_df["importance"].max() * 0.01,  # small offset
        i,
        f"{pct:.1f}%",
        va="center",
        fontsize=10
    )

plt.gca().invert_yaxis()  # highest at top
plt.tight_layout()
plt.show()
Figure 1

SHAP Scatter Plot for a Feature of Interest

Code Cell 16
import matplotlib.pyplot as plt

feature = "PCT_IMPERVIOUS"     #
idx = all_predictors.index(feature)  

plt.figure(figsize=(8,6))
plt.scatter(
    X[feature],         
    shap_values[:, idx],     
    alpha=0.4
)

plt.xlabel(feature)
plt.ylabel(f"SHAP value for {feature}")
plt.title(f"SHAP Scatter Plot for {feature}")
plt.grid(True)
plt.show()
Figure 1
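The manual scatter above plots SHAP values against raw feature values. SHAP's built-in dependence plot does the same and additionally colours points by the feature with the strongest estimated interaction; a minimal sketch reusing the same shap_values and X:

# Built-in SHAP dependence plot; the colour axis is picked automatically
# as the feature with the strongest apparent interaction.
shap.dependence_plot(
    "PCT_IMPERVIOUS",
    shap_values,
    X,
    feature_names=all_predictors,
    show=True
)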

Save SHAP Scatter Plots for All Features

Code Cell 17
# Directory where the SHAP scatter plots will be saved
save_dir = "images/SHAP2/shap_scatter_plots_heat"
os.makedirs(save_dir, exist_ok=True)

# Loop through each predictor to generate SHAP scatter plots
for feature in all_predictors:

    # Get the column index of the current feature in the SHAP values array
    idx = all_predictors.index(feature)

    # Create the scatter plot
    plt.figure(figsize=(8, 6))
    plt.scatter(
        X[feature],          # Raw feature values (global: train + test recommended)
        shap_values[:, idx], # Corresponding SHAP values
        alpha=0.45
    )

    # Axis labels and title
    plt.xlabel(feature, fontsize=12)
    plt.ylabel(f"SHAP value for {feature}", fontsize=12)
    plt.title(f"SHAP Scatter Plot for {feature}", fontsize=14)
    plt.grid(True)

    # Create a safe filename (avoid illegal characters)
    safe_name = feature.replace("/", "_").replace(" ", "_")
    filepath = os.path.join(save_dir, f"{safe_name}.png")

    # Save the figure
    plt.savefig(filepath, dpi=200, bbox_inches="tight")
    plt.close()

print("✓ All SHAP scatter plots have been saved to:", save_dir)
✓ All SHAP scatter plots have been saved to: images/SHAP2/shap_scatter_plots_heat

Regular (Non-Heat) Week Model

Code Cell 18
# 1. Choose which target to model
target2 = "normalweek_calls_per_1k"  

# Log-transform the target to reduce skewness
df[target2] = np.log1p(df[target2])

X = df[all_predictors]
y = df[target2]

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# 3. Define RF pipeline (no scaler)
rf_pipeline = Pipeline([
    ("rf", RandomForestRegressor(random_state=42, n_jobs=-1))
])

# 4. Hyperparameter grid 
param_grid = {
    "rf__n_estimators": [200, 400, 600],
    "rf__max_depth": [10, 20, 30],
    "rf__min_samples_split": [2, 5, 10],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": ["auto", "sqrt", 0.5],
}

# 5. GridSearchCV with 3-fold CV, optimizing RMSE
grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=3,
    ## scoring="r2", 
    scoring="neg_root_mean_squared_error",  # sklearn uses negative for losses 
    n_jobs=-1,
    verbose=2
)

# 6. Fit on training data
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)

# 7. Evaluate on test data using best model
best_rf_pipeline2 = grid.best_estimator_

y_pred_test = best_rf_pipeline2.predict(X_test)

r2_test = r2_score(y_test, y_pred_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
mae_test = mean_absolute_error(y_test, y_pred_test)

print("\n=== Test set performance (Random Forest) ===")
print(f"R² (test):   {r2_test:.4f}")
print(f"RMSE (test): {rmse_test:.4f}")
print(f"MAE (test):  {mae_test:.4f}")
Fitting 3 folds for each of 243 candidates, totalling 729 fits
c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning: 
243 fits failed out of a total of 729.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
141 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'sqrt', 'log2'} or None. Got 'auto' instead.

--------------------------------------------------------------------------------
102 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\DZM\.conda\envs\geospatial\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range
... [output truncated]
Best parameters: {'rf__max_depth': 30, 'rf__max_features': 0.5, 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 2, 'rf__n_estimators': 600}
Best CV RMSE: 0.21015411387026253

=== Test set performance (Random Forest) ===
R² (test):   0.2738
RMSE (test): 0.1940
MAE (test):  0.1537
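Because both targets are modelled on the log1p scale, the RMSE and MAE above are in log units. To report errors in the original calls-per-1k scale, the predictions can be back-transformed with np.expm1; a sketch using the test split and fitted pipeline from this cell:

# Back-transform predictions and targets from log1p space to calls per 1k residents.
y_pred_orig = np.expm1(y_pred_test)
y_test_orig = np.expm1(y_test)

rmse_orig = mean_squared_error(y_test_orig, y_pred_orig, squared=False)
mae_orig = mean_absolute_error(y_test_orig, y_pred_orig)
print(f"RMSE (calls per 1k): {rmse_orig:.3f}")
print(f"MAE  (calls per 1k): {mae_orig:.3f}")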
Code Cell 19
import shap
import matplotlib.pyplot as plt

# Initialize JS visualization (for notebooks)
shap.initjs()

# 1. Get the trained RandomForest model from the pipeline
rf_model2 = best_rf_pipeline2.named_steps["rf"]

# 2. Convert the full dataset (train + test) into a numpy array
X_full = X.values  # TreeExplainer can work with numpy arrays

# 3. Build SHAP explainer
explainer = shap.TreeExplainer(rf_model2)

# 4. Compute SHAP values for the full dataset (train + test)
shap_values = explainer.shap_values(X_full)  # shape: (n_samples, n_features)

# 5. SHAP summary plot (beeswarm)
plt.figure(figsize=(10, 6))
shap.summary_plot(
    shap_values,
    X,
    feature_names=all_predictors,
    show=True
)

# 6. SHAP bar plot (mean |SHAP| value)
plt.figure(figsize=(8, 6))
shap.summary_plot(
    shap_values,
    X,
    feature_names=all_predictors,
    plot_type="bar",
    show=True
)
📋 [Large table - see notebook]
Figure 1
Figure 2

Percentage Feature Importance

Code Cell 20
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# shap_values: shape = (n_samples, n_features)
# all_predictors: list of feature names

# 1. Compute mean absolute SHAP importance for each feature
mean_abs_shap = np.mean(np.abs(shap_values), axis=0)

# 2. Put into DataFrame for easy plotting
shap_df = pd.DataFrame({
    "feature": all_predictors,
    "importance": mean_abs_shap
})

# 3. Sort by importance descending
shap_df = shap_df.sort_values("importance", ascending=False)

# 4. Compute percentage contribution
shap_df["percentage"] = shap_df["importance"] / shap_df["importance"].sum() * 100

# 5. Plot bar chart with percentage label
plt.figure(figsize=(8, 10))
bars = plt.barh(shap_df["feature"], shap_df["importance"], color="steelblue")

plt.xlabel("Mean |SHAP value|", fontsize=12)
plt.title("SHAP Feature Importance (with percentage) for Regular Heat Model", fontsize=14)

# 6. Add percentage labels to each bar
for i, (value, pct) in enumerate(zip(shap_df["importance"], shap_df["percentage"])):
    plt.text(
        value + shap_df["importance"].max() * 0.01,  # small offset
        i,
        f"{pct:.1f}%",
        va="center",
        fontsize=10
    )

plt.gca().invert_yaxis()  # highest at top
plt.tight_layout()
plt.show()
Figure 1
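Since both models share the same predictors, their SHAP importances can be compared directly. A sketch, assuming the two percentage tables were kept in separate (hypothetical) variables shap_df_heat and shap_df_normal instead of overwriting shap_df:

# Hypothetical comparison: assumes copies of the two importance tables were saved,
# e.g. shap_df_heat = shap_df.copy() after Code Cell 15 and
#      shap_df_normal = shap_df.copy() after Code Cell 20.
comparison = (
    shap_df_heat[["feature", "percentage"]]
      .merge(
          shap_df_normal[["feature", "percentage"]],
          on="feature",
          suffixes=("_heatweek", "_normalweek"),
      )
      .assign(diff_pct=lambda d: d["percentage_heatweek"] - d["percentage_normalweek"])
      .sort_values("diff_pct", ascending=False)
)
print(comparison)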

Save SHAP Scatter Plots for All Features

Code Cell 21
# Directory where the SHAP scatter plots will be saved
save_dir = "images/SHAP2/shap_scatter_plots_regular"
os.makedirs(save_dir, exist_ok=True)

# Loop through each predictor to generate SHAP scatter plots
for feature in all_predictors:

    # Get the column index of the current feature in the SHAP values array
    idx = all_predictors.index(feature)

    # Create the scatter plot
    plt.figure(figsize=(8, 6))
    plt.scatter(
        X[feature],          # Raw feature values (global: train + test recommended)
        shap_values[:, idx], # Corresponding SHAP values
        alpha=0.45
    )

    # Axis labels and title
    plt.xlabel(feature, fontsize=12)
    plt.ylabel(f"SHAP value for {feature}", fontsize=12)
    plt.title(f"SHAP Scatter Plot for {feature}", fontsize=14)
    plt.grid(True)

    # Create a safe filename (avoid illegal characters)
    safe_name = feature.replace("/", "_").replace(" ", "_")
    filepath = os.path.join(save_dir, f"{safe_name}.png")

    # Save the figure
    plt.savefig(filepath, dpi=200, bbox_inches="tight")
    plt.close()

print("✓ All SHAP scatter plots have been saved to:", save_dir)
✓ All SHAP scatter plots have been saved to: images/SHAP2/shap_scatter_plots_regular
Code Cell 22
import matplotlib.pyplot as plt

feature = "BD"     #
idx = all_predictors.index(feature)  

plt.figure(figsize=(8,6))
plt.scatter(
    X[feature],         
    shap_values[:, idx],     
    alpha=0.4
)

plt.xlabel(feature)
plt.ylabel(f"SHAP value for {feature}")
plt.title(f"SHAP Scatter Plot for {feature}")
plt.grid(True)
plt.show()
Figure 1


Source: 07_ols_ml.ipynb

Code Cells: 22

Figures: 11