%matplotlib inline
%load_ext autoreload
%autoreload 2


import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Notebook-wide matplotlib defaults: figure size and base font size.
plt.rcParams.update({
    'figure.figsize': [10, 7],
    'font.size': 15,
})

import automlx
from automlx import init


# Fetch the dataset once; the returned Bunch carries the feature matrix, the target
# vector and the feature names, so a second fetch is unnecessary.
ds = fetch_california_housing(return_X_y=False)
X, y = ds.data, ds.target

# Name of the target column, defined before the frame is built so the literal is written only once.
target_col = 'Median Price'

# Assemble features and target side by side in a single DataFrame.
df = pd.concat([pd.DataFrame(X, columns=ds.feature_names),
                pd.DataFrame(y.ravel(), columns=[target_col])], axis=1)

df.shape

(20640, 9)


# Preview the first rows of the assembled frame (feature columns plus the target column).
df.head()


# Keep only the rows that actually carry a value in the target column.
df = df.dropna(subset=[target_col])


# Density curve of the target column (histogram and rug plot disabled).
# The stray mid-cell `df[target_col]` expression and the argument-less
# `fig.update_xaxes()` call were removed: neither had any effect.
fig = ff.create_distplot(
    [df[target_col]],
    group_labels=[target_col],
    show_hist=False,
    show_rug=False,
)
fig.update_layout(
    xaxis_title=target_col,
    yaxis_title="Density",
    showlegend=False,
)
fig.show()


# Separate the target column from the predictor columns.
y_full = df[target_col]
X_full = df.drop(columns=[target_col])

# Hold out 30% of the rows as the test split; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=0.3,
    random_state=7,
)

X_train.shape, X_test.shape

((14448, 8), (6192, 8))


# Initialise AutoMLx with the 'ray' execution engine.
automlx.init(engine='ray')

[2025-05-22 05:35:26,721] [automlx.backend] Overwriting ray session directory to /tmp/7dliinvt/ray, which will be deleted at engine shutdown. If you wish to retain ray logs, provide _temp_dir in ray_setup dict of engine_opts when initializing the AutoMLx engine.


# Build a regression pipeline with default settings and fit it on the training split.
# The fit call is the last expression of the cell, so the object it returns is displayed.
est1 = automlx.Pipeline(task='regression')
est1.fit(X_train, y_train)

[2025-05-22 05:35:30,856] [automlx.interface] Dataset shape: (14448,8)
[2025-05-22 05:35:33,431] [sanerec.autotuning.parameter] Hyperparameter epsilon autotune range is set to its validation range. This could lead to long training times
[2025-05-22 05:35:33,745] [sanerec.autotuning.parameter] Hyperparameter repeat_quality_threshold autotune range is set to its validation range. This could lead to long training times
[2025-05-22 05:35:33,752] [sanerec.autotuning.parameter] Hyperparameter scope autotune range is set to its validation range. This could lead to long training times
[2025-05-22 05:35:33,823] [automlx.data_transform] Running preprocessing. Number of features: 9
[2025-05-22 05:35:33,985] [automlx.data_transform] Preprocessing completed. Took 0.162 secs
[2025-05-22 05:35:34,029] [automlx.process] Running Model Generation
[2025-05-22 05:35:34,077] [automlx.process] KNeighborsRegressor is disabled. The KNeighborsRegressor model is only recommended for datasets with less than 10000 samples and 1000 features.
[2025-05-22 05:35:34,078] [automlx.process] SVR is disabled. The SVR model is only recommended for datasets with less than 10000 samples and 1000 features.
[2025-05-22 05:35:34,079] [automlx.process] Model Generation completed.
[2025-05-22 05:35:34,148] [automlx.model_selection] Running Model Selection
[2025-05-22 05:36:14,592] [automlx.model_selection] Model Selection completed - Took 40.444 sec - Selected models: [['LGBMRegressor']]
[2025-05-22 05:36:14,633] [automlx.adaptive_sampling] Running Adaptive Sampling. Dataset shape: (14448,9).
[2025-05-22 05:36:17,686] [automlx.trials] Adaptive Sampling completed - Took 3.0524 sec.
[2025-05-22 05:36:17,782] [automlx.feature_selection] Starting feature ranking for LGBMRegressor
[2025-05-22 05:36:26,157] [automlx.feature_selection] Feature Selection completed. Took 8.402 secs.
[2025-05-22 05:36:26,215] [automlx.trials] Running Model Tuning for ['LGBMRegressor']
[2025-05-22 05:37:18,586] [automlx.trials] Best parameters for LGBMRegressor: {'num_leaves': 31, 'boosting_type': 'gbdt', 'subsample': 1, 'colsample_bytree': 0.7952797110155084, 'max_depth': 63, 'reg_alpha': 0, 'reg_lambda': 0, 'n_estimators': 377, 'learning_rate': 0.1, 'min_child_weight': 0.001}
[2025-05-22 05:37:18,587] [automlx.trials] Model Tuning completed. Took: 52.372 secs
[2025-05-22 05:37:31,673] [automlx.interface] Re-fitting pipeline
[2025-05-22 05:37:31,687] [automlx.final_fit] Skipping updating parameter seed, already fixed by FinalFit_5653c38b-7
[2025-05-22 05:37:33,814] [automlx.interface] AutoMLx completed.

<automlx._interface.regressor.AutoRegressor at 0x14bb351c0970>


# Score the default pipeline on the held-out test split using mean squared error.
y_pred = est1.predict(X_test)
score_default = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f'Mean squared error on test data : {score_default}')

Mean squared error on test data : 0.2007340045930796


# Print the fitted pipeline's own summary report.
est1.print_summary()


# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
name_of_score_column = f"Score ({est1._inferred_score_metric[0].name})"
trials = (
    est1.completed_trials_summary_[est1.completed_trials_summary_["Step"].str.contains('Model Selection')]
    # Build new frames instead of mutating the filtered slice with inplace=True:
    # this avoids pandas' SettingWithCopyWarning and keeps the cell safely re-runnable.
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
)
scores = trials[name_of_score_column].tolist()
models = trials['Algorithm'].tolist()

# Pad the y-axis by 10% of the score range on both sides.
y_margin = 0.10 * (max(scores) - min(scores))
s = pd.Series(scores, index=models).sort_values(ascending=False)
mean_score = s.mean()

# Bar colours: orange for the selected model, teal for scores at or above the mean,
# turquoise for scores below it. Iterating over (label, value) pairs avoids
# label-based lookups, which would return a Series if an algorithm name repeated.
selected_model = est1.selected_model_.strip()
colors = [
    'orange' if algo.strip() == selected_model
    else 'teal' if algo_score >= mean_score
    else 'turquoise'
    for algo, algo_score in s.items()
]

fig, ax = plt.subplots(1)
ax.set_title("Algorithm Selection Trials")
ax.set_ylim(min(scores) - y_margin, max(scores) + y_margin)
ax.set_ylabel(est1._inferred_score_metric[0].name)
s.plot.bar(ax=ax, color=colors, edgecolor='black')
# Horizontal reference line at the mean score across algorithms.
ax.axhline(y=mean_score, color='black', linewidth=0.5)
plt.show()


# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
trials = (
    est1.completed_trials_summary_[est1.completed_trials_summary_["Step"].str.contains('Adaptive Sampling')]
    # New frames rather than inplace=True on a filtered slice (avoids SettingWithCopyWarning).
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
)
scores = trials[name_of_score_column].tolist()
n_samples = trials['# Samples'].tolist()

# Pad the y-axis by 10% of the score range on both sides.
y_margin = 0.10 * (max(scores) - min(scores))
fig, ax = plt.subplots(1)
ax.set_title("Adaptive Sampling ({})".format(est1.selected_model_))
ax.set_xlabel('Dataset sample size')
ax.set_ylabel(est1._inferred_score_metric[0].name)
ax.grid(color='g', linestyle='-', linewidth=0.1)
ax.set_ylim(min(scores) - y_margin, max(scores) + y_margin)
# The dotted style is given via `linestyle` instead of the 'k:' format string: the format
# string also sets a colour (black), which clashes with color='teal' and makes matplotlib warn.
ax.plot(n_samples, scores, linestyle=':', marker="s", color='teal', markersize=3)
plt.show()


print(f"Features selected: {est1.selected_features_names_}")
# The target column is removed together with the selected features: it is not a feature,
# so it must not show up in the "dropped" list (dropping only the selected features
# from `df` left the target column in the result).
dropped_features = df.drop(columns=[target_col, *est1.selected_features_names_raw_]).columns
print(f"Features dropped: {dropped_features.to_list()}")

# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
trials = (
    est1.completed_trials_summary_[est1.completed_trials_summary_["Step"].str.contains('Feature Selection')]
    # New frames rather than inplace=True on a filtered slice (avoids SettingWithCopyWarning).
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
    .sort_values(by=['# Features'], ascending=True)
)
scores = trials[name_of_score_column].tolist()
n_features = trials['# Features'].tolist()

# Pad the y-axis by 10% of the score range on both sides.
y_margin = 0.10 * (max(scores) - min(scores))
fig, ax = plt.subplots(1)
ax.set_title("Feature Selection Trials")
ax.set_xlabel("Number of Features")
ax.set_ylabel(est1._inferred_score_metric[0].name)
ax.grid(color='g', linestyle='-', linewidth=0.1)
ax.set_ylim(min(scores) - y_margin, max(scores) + y_margin)
# Dotted style via `linestyle` rather than the 'k:' format string, whose colour part
# conflicts with color='teal' and triggers a matplotlib warning.
ax.plot(n_features, scores, linestyle=':', marker="s", color='teal', markersize=3)
# Vertical marker at the number of features that were finally selected.
ax.axvline(x=len(est1.selected_features_names_), color='orange', linewidth=2.0)
plt.show()

Features selected: ['AveOccup', 'AveRooms', 'HouseAge', 'Latitude', 'Longitude', 'MedInc']
Features dropped: ['AveBedrms', 'Population', 'Median Price']


# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
trials = (
    est1.completed_trials_summary_[est1.completed_trials_summary_["Step"].str.contains('Model Tuning')]
    # New frames rather than inplace=True on a filtered slice (avoids SettingWithCopyWarning).
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
)
# Rows whose 'Finished' value is -1 hold no parseable timestamp (presumably trials that
# never completed - confirm against the AutoMLx docs); exclude them before parsing.
trials = trials[trials['Finished'] != -1]
# Parse the textual timestamps in one vectorised call and order the trials chronologically.
trials = trials.assign(
    Finished=pd.to_datetime(trials['Finished'], format="%a %b %d %H:%M:%S %Y")
).sort_values(by=['Finished'], ascending=True)
scores = trials[name_of_score_column].tolist()
# Best score seen so far after each trial (running maximum).
score = trials[name_of_score_column].cummax().tolist()

# Pad the y-axis by 10% of the best-so-far score range on both sides.
y_margin = 0.10 * (max(score) - min(score))
fig, ax = plt.subplots(1)
ax.set_title("Hyperparameter Tuning Trials")
ax.set_xlabel("Iteration $n$")
ax.set_ylabel(est1._inferred_score_metric[0].name)
ax.grid(color='g', linestyle='-', linewidth=0.1)
ax.set_ylim(min(score) - y_margin, max(score) + y_margin)
# Dotted style via `linestyle` rather than the 'k:' format string, whose colour part
# conflicts with color='teal' and triggers a matplotlib warning.
ax.plot(range(1, len(trials) + 1), score, linestyle=':', marker="s", color='teal', markersize=3)
plt.show()


custom_pipeline = automlx.Pipeline(
    task="regression",
    # Models AutoMLx is allowed to consider.
    model_list=["LinearRegression", "AdaBoostRegressor", "XGBRegressor"],
    # How many models to tune.
    n_algos_tuned=2,
    # Minimum features the model is forced to use. Three kinds of value are accepted:
    #   int   -> 0 < min_features <= n_features
    #   float -> 0 < min_features <= 1.0 (1.0 means disabling feature selection)
    #   list  -> names of features to keep, e.g. ['a', 'b'] keeps features 'a' and 'b'
    min_features=1.0,
    # Disable or enable the Adaptive Sampling step. Defaults to `True`.
    adaptive_sampling=False,
    # Disable or enable the Preprocessing step. Defaults to `True`.
    preprocessing=True,
    # Hyper-parameters and ranges to search, keyed by model name.
    search_space={
        "AdaBoostRegressor": {
            "n_estimators": {"range": [10, 20], "type": "discrete"},
        },
    },
    # Maximum number of tuning trials. Can be an integer or a dict (max number for each model).
    max_tuning_trials=2,
    # Any scikit-learn metric or a custom function.
    score_metric="r2",
)


# Carve a validation set (30%) out of the training split.
# The training split is re-derived from X_full/y_full with the same parameters as the
# original train/test split instead of being read from the current X_train/y_train.
# On a first run this yields exactly the same partitions as before; on a re-run it no
# longer shrinks X_train a little more each time, so the cell is idempotent.
# NOTE: keep test_size/random_state below in sync with the original train/test split.
_outer_split = train_test_split(X_full, y_full, test_size=0.3, random_state=7)
X_train_full, y_train_full = _outer_split[0], _outer_split[2]
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, train_size=0.7, random_state=0)


# Fit the customised pipeline on the reduced training set, passing the held-out
# validation set positionally right after the training data.
custom_pipeline.fit(
    X_train,
    y_train,
    X_val,
    y_val,
    time_budget= 20,    # Time budget in seconds
    cv='auto'           # Automatically pick a good cross-validation (cv) strategy for the user's dataset.
                        # Ignored if X_valid and y_valid are provided (as they are in this call).
                        # Can also be:
                        #   - An integer (for example, 5 to use 5-fold cross validation)
                        #   - A list of data indices to use as splits (for advanced uses such as time-based splitting)
)
# Score the customised pipeline on the same held-out test split used for the default pipeline.
y_pred = custom_pipeline.predict(X_test)
score_modellist = mean_squared_error(y_test, y_pred)

print(f'Prediction error (MSE) on test data : {score_modellist}')

[2025-05-22 05:37:37,569] [automlx.interface] Dataset shape: (14448,8)
[2025-05-22 05:37:37,621] [automlx.interface] Adaptive Sampling disabled.
[2025-05-22 05:37:37,661] [automlx.data_transform] Running preprocessing. Number of features: 9
[2025-05-22 05:37:37,814] [automlx.data_transform] Preprocessing completed. Took 0.153 secs
[2025-05-22 05:37:37,840] [automlx.process] Running Model Generation
[2025-05-22 05:37:37,891] [automlx.process] Model Generation completed.
[2025-05-22 05:37:37,922] [automlx.model_selection] Running Model Selection
[2025-05-22 05:37:39,276] [automlx.model_selection] Model Selection completed - Took 1.355 sec - Selected models: [['XGBRegressor', 'LinearRegression']]
[2025-05-22 05:37:39,351] [automlx.trials] Running Model Tuning for ['XGBRegressor']
[2025-05-22 05:37:41,796] [automlx.trials] Best parameters for XGBRegressor: {'n_estimators': 100, 'min_child_weight': 1, 'reg_alpha': 0, 'booster': 'dart', 'max_depth': 6, 'learning_rate': 0.1, 'reg_lambda': 1}
[2025-05-22 05:37:41,797] [automlx.trials] Model Tuning completed. Took: 2.446 secs
[2025-05-22 05:37:41,851] [automlx.trials] skipping model tuning for: [<automlx._pipeline.pipeline.Pipeline object at 0x14b3269e0a90>]
[2025-05-22 05:37:41,948] [automlx.interface] Re-fitting pipeline
[2025-05-22 05:37:41,964] [automlx.final_fit] Skipping updating parameter seed, already fixed by FinalFit_20acbc6f-2
[2025-05-22 05:37:43,400] [automlx.interface] AutoMLx completed.
Prediction error (MSE) on test data : 0.23716484748805952


# Build an explainer for the default AutoMLx pipeline (est1).
# NOTE(review): X_train/y_train here are the reduced training subset produced by the
# validation split above, whereas est1 was fitted before that split on the larger
# training set - confirm that explaining against the subset is intended.
explainer = automlx.MLExplainer(est1,
                              X_train,
                              y_train,
                              task="regression")


# Compute the global feature importance of the default pipeline.
result_explain_model_default = explainer.explain_model(
    n_iter=5,             # Can also be 'auto' to pick a good value for the explainer and task

    scoring_metric='r2',  # Global feature importance measures how much each feature improved the
                          # model's score. Users can choose the scoring metric used here.
)


# Tabular view of the global feature importance.
result_explain_model_default.to_dataframe()


# Rendered (notebook) view of the same explanation.
result_explain_model_default.show_in_notebook()


# Feature dependence explanation for a single feature.
result_explain_feature_dependence_default = explainer.explain_feature_dependence('Latitude')
result_explain_feature_dependence_default.show_in_notebook()


# Same explanation rendered with the ice option enabled.
result_explain_feature_dependence_default.show_in_notebook(ice=True)


# Feature dependence explanation for a pair of features.
# Note: this rebinds result_explain_feature_dependence_default, replacing the single-feature result.
result_explain_feature_dependence_default = explainer.explain_feature_dependence(['Latitude', 'Longitude'])
result_explain_feature_dependence_default.show_in_notebook()


# Explain the predictions for ten randomly sampled training rows and display the first one.
# The fixed random_state makes the sampled rows - and therefore the displayed
# explanation - reproducible across runs.
result_explain_prediction_default = explainer.explain_prediction(X_train.sample(n=10, random_state=0))
result_explain_prediction_default[0].show_in_notebook()


# Aggregate the per-row prediction explanations computed above into a single explanation and display it.
alfi = explainer.aggregate(explanations=result_explain_prediction_default)
alfi.show_in_notebook()


# Launch the interactive what-if exploration on the held-out test split.
explainer.explore_whatif(X_test, y_test)


# Fit a plain scikit-learn model to show that the explainer also works outside AutoMLx pipelines.
scikit_model = LinearRegression()
scikit_model.fit(X_train, y_train)

# Build an explainer around the scikit-learn model.
explainer_sklearn = automlx.MLExplainer(
                              scikit_model,
                              X_train,
                              y_train,
                              target_names=[             # Used for plot labels/legends.
                                  'Median Price'
                              ],
                              selected_features='auto',  # Features used by the model; inferred automatically for AutoML Pipelines.
                              task="regression",
                              col_types=None             # Specify the type of each feature
                             )


# Configure prediction explanations to use the "kernel_shap" tabulator.
# NOTE(review): the next cell reconfigures the prediction explainer before any explanation
# is computed with this setting - an explain_prediction call appears to be missing here.
explainer_sklearn.configure_explain_prediction(tabulator_type="kernel_shap")


# Switch prediction explanations to a surrogate explainer using the 'lime' method.
# NOTE(review): this presumably replaces whatever prediction-explainer configuration was
# active before - confirm against the AutoMLx documentation.
explainer_sklearn.configure_explain_prediction(
                                               explainer_type='surrogate',
                                               method='lime'
                                              )


# Explain the prediction for a single training row (kept as a one-row frame via slicing).
# NOTE(review): the variable name says "kernel_shap", but the most recent
# configure_explain_prediction call selected the surrogate/'lime' explainer - the name
# looks stale; confirm which explainer is meant here.
index = 0
result_explain_prediction_kernel_shap = explainer_sklearn.explain_prediction(X_train.iloc[index:index+1, :])
result_explain_prediction_kernel_shap[0].show_in_notebook()


# Use the "observational" evaluator for global (model-level) explanations.
explainer_sklearn.configure_explain_model(evaluator_type="observational")


# Recompute and display the global explanation with the configuration above.
# NOTE(review): despite the "kernel_shap" suffix in the variable name, no kernel_shap
# option is passed to configure_explain_model - the name may be misleading.
result_explain_model_kernel_shap = explainer_sklearn.explain_model()
result_explain_model_kernel_shap.show_in_notebook()


# Request the "observational" evaluator with the "permutation" tabulator for prediction explanations.
# NOTE(review): the captured log for this cell reports both 'evaluator_type' and
# 'tabulator_type' as unexpected keyword arguments, listing only surrogate-explainer
# components as configurable. The prediction explainer is evidently still the surrogate
# one configured earlier, so this call did not take effect; presumably the explainer type
# must be switched back first - confirm the correct argument in the AutoMLx documentation.
explainer_sklearn.configure_explain_prediction(evaluator_type="observational",
                                              tabulator_type="permutation")

[2025-05-22 05:38:07,469] [automlx.mlx] AutoMLx got an unexpected keyword argument 'evaluator_type', which is not a configurable attribute of any of ['TabularLocalSurrogateExplainer', 'RandomSampleGeneration', 'DistanceWeighting', 'SurrogateHandler'].
Valid options are:
{'TabularLocalSurrogateExplainer': ['method', 'exp_sorting', 'num_features', 'scale_weight'], 'RandomSampleGeneration': ['discretizer', 'num_samples', 'sample_around_instance'], 'DistanceWeighting': ['kernel_width', 'distance_metric', 'dataset_type'], 'SurrogateHandler': ['model', 'feature_selection', 'force_fit_sample']}
[2025-05-22 05:38:07,469] [automlx.mlx] AutoMLx got an unexpected keyword argument 'tabulator_type', which is not a configurable attribute of any of ['TabularLocalSurrogateExplainer', 'RandomSampleGeneration', 'DistanceWeighting', 'SurrogateHandler'].
Valid options are:
{'TabularLocalSurrogateExplainer': ['method', 'exp_sorting', 'num_features', 'scale_weight'], 'RandomSampleGeneration': ['discretizer', 'num_samples', 'sample_around_instance'], 'DistanceWeighting': ['kernel_width', 'distance_metric', 'dataset_type'], 'SurrogateHandler': ['model', 'feature_selection', 'force_fit_sample']}


# Explain the prediction for the same single training row again, under whichever
# prediction-explainer configuration is currently active.
index = 0
result_explain_prediction_kernel_shap = explainer_sklearn.explain_prediction(X_train.iloc[index:index+1, :])
result_explain_prediction_kernel_shap[0].show_in_notebook()


# Switch feature dependence explanations to the 'ale' explanation type, then explain
# the Latitude/Longitude pair for the scikit-learn model.
# Note: this rebinds result_explain_feature_dependence_default, replacing the earlier result.
explainer_sklearn.configure_explain_feature_dependence(explanation_type='ale')
result_explain_feature_dependence_default = explainer_sklearn.explain_feature_dependence(['Latitude', 'Longitude'])
result_explain_feature_dependence_default.show_in_notebook()

	MedInc	HouseAge	AveRooms	AveBedrms	Population	AveOccup	Latitude	Longitude	Median Price
0	8.3252	41.0	6.984127	1.023810	322.0	2.555556	37.88	-122.23	4.526
1	8.3014	21.0	6.238137	0.971880	2401.0	2.109842	37.86	-122.22	3.585
2	7.2574	52.0	8.288136	1.073446	496.0	2.802260	37.85	-122.24	3.521
3	5.6431	52.0	5.817352	1.073059	558.0	2.547945	37.85	-122.25	3.413
4	3.8462	52.0	6.281853	1.081081	565.0	2.181467	37.85	-122.25	3.422

Step	# Samples	# Features	Algorithm	Hyperparameters	Score (neg_mean_squared_error)	Runtime (Seconds)	Memory Usage (GB)	Finished
Model Selection	11559	8	LGBMRegressor	{'num_leaves': 31, 'boosting_type': 'gbdt', 'subsample': 1, 'colsample_bytree': 1, 'max_depth': 63, 'reg_alpha': 0, 'reg_lambda': 0, 'n_estimators': 100, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-0.2183	5.3076	0.3075	Thu May 22 05:35:44 2025
Model Selection	11559	8	XGBRegressor	{'n_estimators': 100, 'min_child_weight': 1, 'reg_alpha': 0, 'booster': 'gbtree', 'max_depth': 6, 'learning_rate': 0.1, 'reg_lambda': 1}	-0.2283	16.4569	0.3459	Thu May 22 05:35:56 2025
Model Selection	11559	8	RandomForestRegressor	{'n_estimators': 100, 'min_samples_split': 0.0003, 'min_samples_leaf': 0.00015, 'max_features': 0.777777778}	-0.2585	34.3275	0.3597	Thu May 22 05:35:48 2025
Model Selection	11559	8	ExtraTreesRegressor	{'n_estimators': 100, 'min_samples_split': 0.00125, 'min_samples_leaf': 0.000625, 'max_features': 0.777777778}	-0.2983	6.5971	0.2965	Thu May 22 05:35:43 2025
Model Selection	11559	8	DecisionTreeRegressor	{'min_samples_split': 0.004, 'min_samples_leaf': 0.002, 'max_features': 1.0}	-0.3877	3.4246	0.2808	Thu May 22 05:35:42 2025
Model Selection	11559	8	LinearRegression	{}	-0.5296	0.7690	0.3102	Thu May 22 05:35:43 2025
Model Selection	11559	8	AdaBoostRegressor	{'learning_rate': 0.667, 'n_estimators': 50}	-0.701	13.9089	0.2777	Thu May 22 05:35:42 2025
Model Selection	11559	8	LinearSVR	{'C': 1.0}	-1.6878	6.5363	0.3188	Thu May 22 05:35:44 2025
Model Selection	11558	8	TorchMLPRegressor	{'optimizer_class': 'Adam', 'shuffle_dataset_each_epoch': True, 'optimizer_params': {}, 'criterion_class': None, 'criterion_params': {}, 'scheduler_class': None, 'scheduler_params': {}, 'batch_size': 128, 'lr': 0.001, 'epochs': 18, 'input_transform': 'auto', 'tensorboard_dir': None, 'use_tqdm': None, 'prediction_batch_size': 128, 'prediction_input_transform': 'auto', 'shuffling_buffer_size': None, 'depth': 4, 'num_logits': 1000, 'div_factor': 2, 'activation': 'ReLU', 'dropout': 0.1}	-3.1322	256.0426	0.6121	Thu May 22 05:36:14 2025
Adaptive Sampling	11559	8	AdaptiveSamplingStage_LGBMRegressor	{'num_leaves': 31, 'boosting_type': 'gbdt', 'subsample': 1, 'colsample_bytree': 1, 'max_depth': 63, 'reg_alpha': 0, 'reg_lambda': 0, 'n_estimators': 100, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-0.2183	2.8107	0.6115	Thu May 22 05:36:17 2025
...	...	...	...	...	...	...	...	...
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 1e-10, 'reg_lambda': 9.999999990000003e-07, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.7727	0.6235	Thu May 22 05:36:45 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 9.999999990000003e-07, 'reg_lambda': 1e-10, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.7874	0.6233	Thu May 22 05:36:44 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 1e-10, 'reg_lambda': 1.7784127779939314e-05, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.7908	0.6235	Thu May 22 05:36:46 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 1e-10, 'reg_lambda': 1.8784127778939314e-05, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.8087	0.6235	Thu May 22 05:36:46 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 1.7784127779939314e-05, 'reg_lambda': 1e-10, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.8432	0.6233	Thu May 22 05:36:44 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 1.8784127778939314e-05, 'reg_lambda': 1e-10, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.7551	0.6233	Thu May 22 05:36:45 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 1e-10, 'reg_lambda': 0.005623553830557401, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.6737	0.6156	Thu May 22 05:36:46 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 1e-10, 'reg_lambda': 0.0056245538305564015, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.9709	0.6156	Thu May 22 05:36:46 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 0.005623553830557401, 'reg_lambda': 1e-10, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.8067	0.6233	Thu May 22 05:36:45 2025
Model Tuning	11559	6	LGBMRegressor	{'num_leaves': 7, 'boosting_type': 'gbdt', 'subsample': 0.4, 'colsample_bytree': 0.4, 'max_depth': 2, 'reg_alpha': 0.0056245538305564015, 'reg_lambda': 1e-10, 'n_estimators': 5, 'learning_rate': 0.1, 'min_child_weight': 0.001}	-1.1075	0.7816	0.6233	Thu May 22 05:36:45 2025

	feature	attribution	upper_bound	lower_bound
0	Latitude	1.262288	1.322257	1.202320
1	Longitude	1.073291	1.126496	1.020086
2	MedInc	0.369554	0.377468	0.361640
3	AveOccup	0.154384	0.164288	0.144480
4	AveRooms	0.116088	0.122427	0.109748
5	HouseAge	0.063371	0.065839	0.060902

Building and Explaining a Regressor using AutoMLx

Overview of this Notebook

Prerequisites:

Business Use:

Table of Contents

Setup

Load the California housing dataset using sklearn.datasets

AutoML

Setting the execution engine

Create an instance of Oracle AutoMLx

Train a model using Oracle AutoMLx

Analyze the AutoMLx optimization process

Algorithm Selection

Adaptive Sampling

Feature Selection

Hyperparameter Tuning

Advanced AutoMLx Configuration

Use a custom validation set

Machine Learning Explainability

Initialize an MLExplainer

Model Explanations (Global Feature importance)

Compute the importance

Visualization

Feature Dependence Explanations (PDP + ICE)

Prediction Explanations (Local Feature Importance)

Aggregate Local Feature Importance

Interactive What-If Explanations

Advanced Feature Importance Options

Configure prediction explanation

Include the effects of feature interactions (with Shapley feature importance)

Local feature importance with kernel_shap

Local feature importance using surrogate models (LIME+)

Explain the model or Explain the world

Explain the model with observational evaluator_type

Explain predictions with observational evaluator_type

Advanced Feature Dependence Options (ALE)

References