%matplotlib inline
%load_ext autoreload
%autoreload 2


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_selector as selector
import time
import datetime
# Global matplotlib defaults shared by every figure in this notebook
plt.rcParams.update({
    'figure.figsize': [10, 7],
    'font.size': 15,
})

import automlx
from automlx import init


# Fetch version 1 of the OpenML "adult" dataset as pandas objects
dataset = fetch_openml(name='adult', version=1, as_frame=True)
df = dataset.data    # feature table
y = dataset.target   # target column


# Preview the first rows of the feature table
df.head()


# Column dtypes, laid out as a single row
df.dtypes.to_frame(name='Data type').T


# Percentage of missing values per column, laid out as a single row
(df.isnull().sum() * 100 / len(df)).to_frame(name='% missing values').T


# Show how the rows are distributed over the target classes
y_df = y.to_frame(name='income')

fig = px.histogram(y_df, x="income")
fig.show()


# Several of the columns are incorrectly labeled as category type in the original dataset,
# so cast each of them to int when it is present in the frame.
numeric_columns = ['age', 'capitalgain', 'capitalloss', 'hoursperweek']
for column_name in numeric_columns:
    if column_name not in df.columns:
        continue
    df[column_name] = df[column_name].astype(int)


# Encode the target as 0/1 integers, then keep 70% of the rows for training
y_binary = y.map({'>50K': 1, '<=50K': 0}).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    df, y_binary, train_size=0.7, random_state=0
)

X_train.shape, X_test.shape

((34189, 14), (14653, 14))


# Initialize the AutoMLx execution engine, selecting Ray as the backend
init(engine='ray')

[2025-04-25 03:05:06,718] [automlx.backend] Overwriting ray session directory to /tmp/n4cg234n/ray, which will be deleted at engine shutdown. If you wish to retain ray logs, provide _temp_dir in ray_setup dict of engine_opts when initializing the AutoMLx engine.


# Build an AutoMLx pipeline with default settings and fit it on the training split
est1 = automlx.Pipeline()
est1.fit(X_train, y_train)

[2025-04-25 03:05:12,061] [automlx.interface] Dataset shape: (34189,14)
[2025-04-25 03:05:19,855] [sanerec.autotuning.parameter] Hyperparameter epsilon autotune range is set to its validation range. This could lead to long training times
[2025-04-25 03:05:21,017] [sanerec.autotuning.parameter] Hyperparameter repeat_quality_threshold autotune range is set to its validation range. This could lead to long training times
[2025-04-25 03:05:21,041] [sanerec.autotuning.parameter] Hyperparameter scope autotune range is set to its validation range. This could lead to long training times
[2025-04-25 03:05:21,155] [automlx.data_transform] Running preprocessing. Number of features: 15
[2025-04-25 03:05:21,812] [automlx.data_transform] Preprocessing completed. Took 0.657 secs
[2025-04-25 03:05:21,854] [automlx.process] Running Model Generation
[2025-04-25 03:05:21,909] [automlx.process] KNeighborsClassifier is disabled. The KNeighborsClassifier model is only recommended for datasets with less than 10000 samples and 1000 features.
[2025-04-25 03:05:21,909] [automlx.process] SVC is disabled. The SVC model is only recommended for datasets with less than 10000 samples and 1000 features.
[2025-04-25 03:05:21,911] [automlx.process] Model Generation completed.
[2025-04-25 03:05:21,985] [automlx.model_selection] Running Model Selection
[2025-04-25 03:05:53,108] [automlx.model_selection] Model Selection completed - Took 31.123 sec - Selected models: [['XGBClassifier']]
[2025-04-25 03:05:53,134] [automlx.adaptive_sampling] Running Adaptive Sampling. Dataset shape: (34189,16).
[2025-04-25 03:05:55,173] [automlx.trials] Adaptive Sampling completed - Took 2.0383 sec.
[2025-04-25 03:05:55,267] [automlx.feature_selection] Starting feature ranking for XGBClassifier
[2025-04-25 03:06:03,981] [automlx.feature_selection] Feature Selection completed. Took 8.730 secs.
[2025-04-25 03:06:04,028] [automlx.trials] Running Model Tuning for ['XGBClassifier']
[2025-04-25 03:06:47,769] [automlx.trials] Best parameters for XGBClassifier: {'learning_rate': 0.10242113515453982, 'min_child_weight': 2, 'max_depth': 4, 'reg_alpha': 0.0007113117640155693, 'booster': 'gbtree', 'reg_lambda': 1.001, 'n_estimators': 141, 'use_label_encoder': False}
[2025-04-25 03:06:47,770] [automlx.trials] Model Tuning completed. Took: 43.742 secs
[2025-04-25 03:06:54,215] [automlx.interface] Re-fitting pipeline
[2025-04-25 03:06:54,230] [automlx.final_fit] Skipping updating parameter seed, already fixed by FinalFit_806b0073-0
[2025-04-25 03:06:56,578] [automlx.interface] AutoMLx completed.

<automlx._interface.classifier.AutoClassifier at 0x151aab3c6d00>


# ROC AUC on the held-out split, computed from column 1 of the predicted probabilities
y_proba = est1.predict_proba(X_test)
positive_class_proba = y_proba[:, 1]
score_default = roc_auc_score(y_test, positive_class_proba)

print(f'Score on test data : {score_default}')

Score on test data : 0.9140858492615117


# Print the summary of the fitted AutoMLx pipeline
est1.print_summary()


# Algorithm Selection: bar chart of the score reached by each candidate algorithm.
# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
all_trials = est1.completed_trials_summary_
score_metric_name = est1._inferred_score_metric[0].name
name_of_score_column = f"Score ({score_metric_name})"  # also read by the later trial plots

# Build a fresh frame instead of mutating the filtered slice in place:
# in-place replace/dropna on a slice can raise SettingWithCopyWarning.
trials = (
    all_trials[all_trials["Step"].str.contains('Model Selection')]
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
)
scores = trials[name_of_score_column].tolist()
models = trials['Algorithm'].tolist()

y_margin = 0.10 * (max(scores) - min(scores))
s = pd.Series(scores, index=models).sort_values(ascending=False)
mean_score = s.mean()
selected_model = est1.selected_model_.strip()

# Colour code: orange = selected model, teal = at or above the mean, turquoise = below the mean.
# Iterating over (label, value) pairs keeps this correct even if an algorithm name repeats.
colors = []
for model_name, model_score in s.items():
    if model_name.strip() == selected_model:
        colors.append('orange')
    elif model_score >= mean_score:
        colors.append('teal')
    else:
        colors.append('turquoise')

fig, ax = plt.subplots(1)
ax.set_title("Algorithm Selection Trials")
ax.set_ylim(min(scores) - y_margin, max(scores) + y_margin)
ax.set_ylabel(score_metric_name)
s.plot.bar(ax=ax, color=colors, edgecolor='black')
ax.axhline(y=mean_score, color='black', linewidth=0.5)  # reference line at the mean score
plt.show()


# Adaptive Sampling: score obtained for each sample size that was tried.
# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
all_trials = est1.completed_trials_summary_
# Build a fresh frame instead of mutating the filtered slice in place:
# in-place replace/dropna on a slice can raise SettingWithCopyWarning.
trials = (
    all_trials[all_trials["Step"].str.contains('Adaptive Sampling')]
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
)
scores = trials[name_of_score_column].tolist()
# '# Samples' can hold a dict of counts (presumably one entry per CV fold -- TODO confirm);
# in that case plot the average count, otherwise use the value as-is.
n_samples = [int(sum(d.values()) / len(d)) if isinstance(d, dict) else d for d in trials['# Samples']]


y_margin = 0.10 * (max(scores) - min(scores))
fig, ax = plt.subplots(1)
ax.set_title("Adaptive Sampling ({})".format(est1.selected_model_))
ax.set_xlabel('Dataset sample size')
ax.set_ylabel(est1._inferred_score_metric[0].name)
ax.grid(color='g', linestyle='-', linewidth=0.1)
ax.set_ylim(min(scores) - y_margin, max(scores) + y_margin)
# Dotted teal line; the style is given by keyword only, because a 'k:' format string
# combined with color= makes matplotlib warn about a redundantly defined colour.
ax.plot(n_samples, scores, linestyle=':', marker="s", color='teal', markersize=3)
plt.show()


# Feature Selection: report the kept/dropped features and plot score against feature count.
print(f"Features selected: {est1.selected_features_names_}")
dropped_features = df.drop(est1.selected_features_names_raw_, axis=1).columns
print(f"Features dropped: {dropped_features.to_list()}")

# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
all_trials = est1.completed_trials_summary_
# Build a fresh frame instead of mutating the filtered slice in place:
# in-place replace/dropna/sort on a slice can raise SettingWithCopyWarning.
trials = (
    all_trials[all_trials["Step"].str.contains('Feature Selection')]
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
    .sort_values(by=['# Features'], ascending=True)
)
scores = trials[name_of_score_column].tolist()
n_features = trials['# Features'].tolist()

y_margin = 0.10 * (max(scores) - min(scores))
fig, ax = plt.subplots(1)
ax.set_title("Feature Selection Trials")
ax.set_xlabel("Number of Features")
ax.set_ylabel(est1._inferred_score_metric[0].name)
ax.grid(color='g', linestyle='-', linewidth=0.1)
ax.set_ylim(min(scores) - y_margin, max(scores) + y_margin)
# Dotted teal line; the style is given by keyword only, because a 'k:' format string
# combined with color= makes matplotlib warn about a redundantly defined colour.
ax.plot(n_features, scores, linestyle=':', marker="s", color='teal', markersize=3)
# Vertical marker at the number of features that were finally selected
ax.axvline(x=len(est1.selected_features_names_), color='orange', linewidth=2.0)
plt.show()

Features selected: ['age', 'capitalgain', 'capitalloss', 'education', 'education-num', 'fnlwgt', 'hoursperweek', 'marital-status', 'native-country', 'occupation', 'race', 'relationship', 'sex_1', 'sex_2', 'workclass']
Features dropped: []


# Hyperparameter Tuning: best score seen so far, in the order the trials finished.
# Each trial is a row in a dataframe that contains
# Algorithm, Number of Samples, Number of Features, Hyperparameters, Score, Runtime, Memory Usage, Step as features
all_trials = est1.completed_trials_summary_
# Build a fresh frame instead of mutating the filtered slice in place:
# in-place replace/dropna/drop on a slice can raise SettingWithCopyWarning.
trials = (
    all_trials[all_trials["Step"].str.contains('Model Tuning')]
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[name_of_score_column])
)
# Rows whose 'Finished' value is -1 carry no timestamp (presumably unfinished trials); skip them.
trials = trials[trials['Finished'] != -1]
# Parse the timestamps in one vectorised call. The stable sort keeps the original row order
# among trials that finished within the same second, so the curve is deterministic.
trials = trials.assign(
    Finished=pd.to_datetime(trials['Finished'], format="%a %b %d %H:%M:%S %Y")
).sort_values(by='Finished', kind='stable')
scores = trials[name_of_score_column].tolist()
# Running maximum of the score: the best value reached up to each trial.
best_so_far = trials[name_of_score_column].cummax().tolist()
y_margin = 0.10 * (max(best_so_far) - min(best_so_far))

fig, ax = plt.subplots(1)
ax.set_title("Hyperparameter Tuning Trials")
ax.set_xlabel("Iteration $n$")
ax.set_ylabel(est1._inferred_score_metric[0].name)
ax.grid(color='g', linestyle='-', linewidth=0.1)
ax.set_ylim(min(best_so_far) - y_margin, max(best_so_far) + y_margin)
# Dotted teal line; the style is given by keyword only, because a 'k:' format string
# combined with color= makes matplotlib warn about a redundantly defined colour.
ax.plot(range(1, len(trials) + 1), best_so_far, linestyle=':', marker="s", color='teal', markersize=3)
plt.show()


# Confusion matrix of the default pipeline on the test split, shown as row percentages.
y_pred = est1.predict(X_test)
# normalize='true' divides each row by the number of actual samples of that class and
# yields 0 (not NaN) for a class with no samples. The targets were encoded as 0/1 integers,
# so the labels are given as integers as well.
cm = confusion_matrix(y_test.astype(int), y_pred, labels=[0, 1], normalize='true')

# Cell annotations: fraction -> percentage with two decimals
text = [[f"{fraction*100:.2f}" for fraction in row] for row in cm]
fig = ff.create_annotated_heatmap(cm, x=['<=50K', '>50K'], y=['<=50K', '>50K'], annotation_text=text, colorscale='Viridis')

# Axis titles are added as paper-anchored annotations below and to the left of the heatmap
fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=0.5,
                        y=-0.15,
                        showarrow=False,
                        text="Predicted value",
                        xref="paper",
                        yref="paper"))

fig.add_annotation(dict(font=dict(color="black",size=14),
                        x=-0.15,
                        y=0.5,
                        showarrow=False,
                        text="Actual",
                        textangle=-90,
                        xref="paper",
                        yref="paper"))
# Extra left/top margin so the rotated "Actual" label is not clipped
fig.update_layout(margin=dict(t=50, l=150))
fig.show()


# Customized AutoMLx pipeline: restricts the candidate models, forces some features to be kept,
# overrides part of the LGBMClassifier search space and optimizes the 'f1_macro' metric.
custom_pipeline = automlx.Pipeline(
    task='classification',
    model_list=[                 # Specify the models you want AutoMLx to consider
        'LogisticRegression',
        'LGBMClassifier',
        'GaussianNB'
    ],
    n_algos_tuned=2,             # Choose how many models to tune
    min_features=[               # Minimum features the model is forced to use. It accepts three kinds of values:
        'native-country',        #   - int: 0 < min_features <= n_features
        'marital-status',        #   - float: 0 < min_features <= 1.0 (1.0 disables feature selection)
        'education-num'          #   - list: names of features to keep, e.g. ['a', 'b'] keeps features 'a' and 'b'
    ],
    adaptive_sampling=False,     # Disable or enable the Adaptive Sampling step. Defaults to `True`
    preprocessing=True,          # Disable or enable the Preprocessing step. Defaults to `True`
    search_space={               # You can specify the hyper-parameters and ranges to search
        'LGBMClassifier': {
            'learning_rate': {'range': [0.01, 10], 'type': 'continuous'},
            'boosting_type': {'range': ['gbdt', 'dart'], 'type': 'categorical'},
        },
    },
    max_tuning_trials=2,         # The maximum number of tuning trials. Can be an integer or a dict (max number for each model)
    score_metric='f1_macro',     # Any scikit-learn metric or a custom function
)


# Carve a validation set out of the training split (70% train / 30% validation).
# NOTE(review): X_train and y_train are rebound to the smaller subset here, so every later use
# of X_train sees the subset, and re-running this line shrinks the training data again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.7, random_state=0)


# Fit the customized pipeline, passing the explicit validation set after the training data.
custom_pipeline.fit(
    X_train,
    y_train,
    X_val,
    y_val,
    # Time budget, in seconds
    time_budget=20,
    # 'auto' lets AutoMLx pick a good cross-validation (cv) strategy for the user's dataset.
    # Ignored if X_valid and y_valid are provided.
    # Can also be:
    #   - An integer (For example, to use 5-fold cross validation)
    #   - A list of data indices to use as splits (for advanced, such as time-based splitting)
    cv='auto',
)

# ROC AUC of the customized pipeline on the held-out test split (positive class = column 1)
y_proba = custom_pipeline.predict_proba(X_test)
positive_class_proba = y_proba[:, 1]
score_modellist = roc_auc_score(y_test, positive_class_proba)

print(f'ROC AUC Score on test data : {score_modellist}')

[2025-04-25 03:07:01,457] [automlx.interface] Dataset shape: (34189,14)
[2025-04-25 03:07:01,518] [automlx.interface] Adaptive Sampling disabled.
[2025-04-25 03:07:01,563] [automlx.data_transform] Running preprocessing. Number of features: 15
[2025-04-25 03:07:02,094] [automlx.data_transform] Preprocessing completed. Took 0.531 secs
[2025-04-25 03:07:02,127] [automlx.process] Running Model Generation
[2025-04-25 03:07:02,185] [automlx.process] Model Generation completed.
[2025-04-25 03:07:02,217] [automlx.model_selection] Running Model Selection
[2025-04-25 03:07:03,661] [automlx.model_selection] Model Selection completed - Took 1.444 sec - Selected models: [['LGBMClassifier', 'LogisticRegressionClassifier']]
[2025-04-25 03:07:03,746] [automlx.feature_selection] Starting feature ranking for LGBMClassifier
[2025-04-25 03:07:07,088] [automlx.feature_selection] Feature Selection completed. Took 3.342 secs.
[2025-04-25 03:07:07,215] [automlx.trials] Running Model Tuning for ['LGBMClassifier']
[2025-04-25 03:07:08,436] [automlx.trials] Best parameters for LGBMClassifier: {'num_leaves': 31, 'boosting_type': 'gbdt', 'learning_rate': 0.01, 'min_child_weight': 0.001, 'max_depth': -1, 'reg_alpha': 0, 'reg_lambda': 1, 'n_estimators': 100, 'class_weight': None}
[2025-04-25 03:07:08,437] [automlx.trials] Model Tuning completed. Took: 1.222 secs
[2025-04-25 03:07:08,521] [automlx.feature_selection] Starting feature ranking for LogisticRegressionClassifier
[2025-04-25 03:07:15,245] [automlx.feature_selection] Feature Selection completed. Took 6.724 secs.
[2025-04-25 03:07:15,272] [automlx.trials] Running Model Tuning for ['LogisticRegressionClassifier']
[2025-04-25 03:07:16,400] [automlx.trials] Best parameters for LogisticRegressionClassifier: {'C': 0.0363696875, 'solver': 'liblinear', 'class_weight': 'balanced'}
[2025-04-25 03:07:16,401] [automlx.trials] Model Tuning completed. Took: 1.130 secs
[2025-04-25 03:07:16,754] [automlx.interface] Re-fitting pipeline
[2025-04-25 03:07:16,780] [automlx.final_fit] Skipping updating parameter seed, already fixed by FinalFit_33c32b58-6
[2025-04-25 03:07:18,836] [automlx.interface] AutoMLx completed.
ROC AUC Score on test data : 0.901457699826238


# Create an explainer for the default AutoMLx pipeline (est1), handing it the
# current X_train / y_train as the data to explain against.
explainer = automlx.MLExplainer(est1,
                               X_train,
                               y_train)


# Compute the model-level (global) feature importance of est1
result_explain_model_default = explainer.explain_model(
    n_iter=5,                            # Can also be 'auto' to pick a good value for the explainer and task

    scoring_metric='balanced_accuracy',  # Global feature importance measures how much each feature improved the
                                         # model's score. Users can choose the scoring metric used here.
)


# Tabular view of the explanation
result_explain_model_default.to_dataframe()


# Notebook visualization of the same explanation
result_explain_model_default.show_in_notebook()


# Feature dependence explanation for a single feature
result_explain_feature_dependence_default = explainer.explain_feature_dependence('education-num')
result_explain_feature_dependence_default.show_in_notebook()


# Same explanation, rendered with the ice=True option
result_explain_feature_dependence_default.show_in_notebook(ice=True)


# Feature dependence explanation for three features at once; rebinds the result variable
result_explain_feature_dependence_default = explainer.explain_feature_dependence(['education-num', 'hoursperweek', 'sex'])
result_explain_feature_dependence_default.show_in_notebook()


# Explain the predictions for 10 randomly sampled training rows and show the first explanation.
# NOTE(review): sample() is called without random_state, so the rows differ between runs.
result_explain_prediction_default = explainer.explain_prediction(X_train.sample(n=10))
result_explain_prediction_default[0].show_in_notebook()


# Aggregate the per-row explanations computed above into a single view
alfi = explainer.aggregate(explanations=result_explain_prediction_default)
alfi.show_in_notebook()


# Open the what-if explorer on the test split
explainer.explore_whatif(X_test, y_test)


# Counterfactual explanations for the first test row, using the 'ace' strategy
explainer.configure_explain_counterfactual(strategy='ace')
first_test_row = X_test.iloc[0:1]
explanations = explainer.explain_counterfactual(
    first_test_row,
    n_counterfactuals=3,
    desired_pred='auto',
    features_to_fix=['age'],
)
explanations[0].show_in_notebook()


# scikit-learn baseline: impute + scale the numeric columns, one-hot encode the
# object/category columns, then fit a logistic regression on the result.
categorical_dtypes = [object, 'category']

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude=categorical_dtypes)),
        ("cat", categorical_transformer, selector(dtype_include=categorical_dtypes)),
    ]
)

scikit_model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)


scikit_model.fit(X_train, y_train)

# Create an explainer for the scikit-learn pipeline, built on the same X_train / y_train
explainer_sklearn = automlx.MLExplainer(
                              scikit_model,
                              X_train,
                              y_train,
                              target_names=[             # Used for plot labels/legends.
                                          "<=50K",
                                          ">50K"
                                          ],
                              selected_features='auto',  # Features used by the model; automatically inferred for AutoML Pipelines.
                              task="classification",
                              col_types=None             # Specify the type of each feature
                             )


# Local feature importance with kernel_shap.
# The explanation is computed right after this configuration; previously the LIME
# configuration below was applied before any explanation ran, so the kernel_shap
# settings were never used and the "*_kernel_shap" result was produced under the
# LIME configuration.
explainer_sklearn.configure_explain_prediction(tabulator_type="kernel_shap",
                                              sampling={'technique': 'random', 'n_samples': 2000}
                                              )


index = 0
result_explain_prediction_kernel_shap = explainer_sklearn.explain_prediction(X_train.iloc[index:index+1,:])
result_explain_prediction_kernel_shap[0].show_in_notebook()


# Local feature importance using a surrogate model (LIME), for the same row
explainer_sklearn.configure_explain_prediction(
                                               explainer_type='surrogate',
                                              method='lime'
                                              )


result_explain_prediction_lime = explainer_sklearn.explain_prediction(X_train.iloc[index:index+1,:])
result_explain_prediction_lime[0].show_in_notebook()


# Switch the global feature importance of the scikit-learn explainer to the "observational" evaluator
explainer_sklearn.configure_explain_model(evaluator_type="observational")


# Compute and display the global feature importance under that configuration
result_explain_model_observational = explainer_sklearn.explain_model()
result_explain_model_observational.show_in_notebook()


# Switch feature dependence explanations to the 'ale' explanation type.
# Note: the result variable below was previously bound to the est1 feature-dependence explanation.
explainer_sklearn.configure_explain_feature_dependence(explanation_type='ale')
result_explain_feature_dependence_default = explainer_sklearn.explain_feature_dependence(['education-num', 'sex'])
result_explain_feature_dependence_default.show_in_notebook()

	age	workclass	fnlwgt	education	education-num	marital-status	occupation	relationship	race	sex	capitalgain	hoursperweek	native-country
0	2	State-gov	77516.0	Bachelors	13.0	Never-married	Adm-clerical	Not-in-family	White	Male	1	2	United-States
1	3	Self-emp-not-inc	83311.0	Bachelors	13.0	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	0	United-States
2	2	Private	215646.0	HS-grad	9.0	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	2	United-States
3	3	Private	234721.0	11th	7.0	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	2	United-States
4	1	Private	338409.0	Bachelors	13.0	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	2	Cuba

Step	# Samples	# Features	Algorithm	Hyperparameters	Score (neg_log_loss)	All Metrics	Runtime (Seconds)	Memory Usage (GB)	Finished
Model Selection	{1: 4000, 2: 4000, 3: 4000, 5: 4000, 4: 4000}	15	XGBClassifier	{'learning_rate': 0.1, 'min_child_weight': 1, 'max_depth': 3, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 1, 'n_estimators': 100, 'use_label_encoder': False}	-0.3813	{'neg_log_loss': -0.38129734602336923}	1.1177	0.3214	Fri Apr 25 03:05:43 2025
Model Selection	{2: 4000, 5: 4000, 1: 4000, 3: 4000, 4: 4000}	15	CatBoostClassifier	{'iterations': 235, 'learning_rate': 0.787168, 'leaf_estimation_method': 'Newton', 'colsample_bylevel': 0.096865, 'depth': 3, 'l2_leaf_reg': 2.567326, 'feature_border_type': 'UniformAndQuantiles', 'model_size_reg': 3.85132, 'leaf_estimation_iterations': 1, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'auto_class_weights': 'SqrtBalanced', 'allow_writing_files': False, 'allow_const_label': True}	-0.3881	{'neg_log_loss': -0.38808832474759763}	10.8342	0.3193	Fri Apr 25 03:05:42 2025
Model Selection	{3: 4000, 1: 4000, 5: 4000, 4: 4000, 2: 4000}	15	LGBMClassifier	{'num_leaves': 31, 'boosting_type': 'gbdt', 'learning_rate': 0.1, 'min_child_weight': 0.001, 'max_depth': -1, 'reg_alpha': 0, 'reg_lambda': 1, 'n_estimators': 100, 'class_weight': 'balanced'}	-0.3898	{'neg_log_loss': -0.38980519262954033}	12.8814	0.3083	Fri Apr 25 03:05:43 2025
Model Selection	{1: 4000, 5: 4000, 3: 4000, 4: 4000, 2: 4000}	15	LogisticRegressionClassifier	{'C': 1.0, 'solver': 'liblinear', 'class_weight': 'balanced'}	-0.3908	{'neg_log_loss': -0.3907935186356912}	0.6874	0.3279	Fri Apr 25 03:05:43 2025
Model Selection	{1: 4000, 2: 4000, 3: 4000, 4: 4000, 5: 4000}	15	RandomForestClassifier	{'n_estimators': 100, 'min_samples_split': 0.00125, 'min_samples_leaf': 0.000625, 'max_features': 0.777777778, 'class_weight': 'balanced'}	-0.4093	{'neg_log_loss': -0.40930045927988684}	2.7773	0.2950	Fri Apr 25 03:05:43 2025
Model Selection	{2: 4000, 1: 4000, 3: 4000, 4: 4000, 5: 4000}	15	ExtraTreesClassifier	{'n_estimators': 100, 'min_samples_split': 0.00125, 'min_samples_leaf': 0.000625, 'max_features': 0.777777778, 'class_weight': 'balanced', 'criterion': 'gini'}	-0.4196	{'neg_log_loss': -0.4196315514455088}	3.1310	0.2772	Fri Apr 25 03:05:41 2025
Model Selection	{3: 4000, 5: 4000, 4: 4000, 2: 4000, 1: 4000}	15	TorchMLPClassifier	{'optimizer_class': 'Adam', 'shuffle_dataset_each_epoch': True, 'optimizer_params': {}, 'criterion_class': None, 'criterion_params': {}, 'scheduler_class': None, 'scheduler_params': {}, 'batch_size': 128, 'lr': 0.001, 'epochs': 18, 'input_transform': 'auto', 'tensorboard_dir': None, 'use_tqdm': None, 'prediction_batch_size': 128, 'prediction_input_transform': 'auto', 'shuffling_buffer_size': None, 'depth': 4, 'num_logits': 1000, 'div_factor': 2, 'activation': 'ReLU', 'dropout': 0.1}	-0.8294	{'neg_log_loss': -0.8293741212177753}	49.1100	0.6041	Fri Apr 25 03:05:52 2025
Model Selection	{1: 4000, 2: 4000, 3: 4000, 4: 4000, 5: 4000}	15	GaussianNB	{}	-1.0622	{'neg_log_loss': -1.0622002049203993}	0.3964	0.2871	Fri Apr 25 03:05:41 2025
Model Selection	{4: 4000, 3: 4000, 1: 4000, 2: 4000, 5: 4000}	15	DecisionTreeClassifier	{'min_samples_split': 0.00125, 'min_samples_leaf': 0.000625, 'max_features': 1.0, 'class_weight': None}	-4.6323	{'neg_log_loss': -4.632288461805368}	1.3565	0.2737	Fri Apr 25 03:05:41 2025
Adaptive Sampling	{1: 15493, 2: 15493, 3: 15493, 4: 15493, 5: 15494}	15	AdaptiveSamplingStage_XGBClassifier	{'learning_rate': 0.1, 'min_child_weight': 1, 'max_depth': 3, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 1, 'n_estimators': 100, 'use_label_encoder': False}	-0.349	{'neg_log_loss': -0.3490238812123456}	3.3692	0.6036	Fri Apr 25 03:05:54 2025
...	...	...	...	...	...	...	...	...	...
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0.0007113117640155693, 'booster': 'gbtree', 'reg_lambda': 0, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911964137047958}	1.2147	0.6153	Fri Apr 25 03:06:19 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0.0007513117640155693, 'booster': 'gbtree', 'reg_lambda': 0, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911964137047958}	1.2364	0.6092	Fri Apr 25 03:06:20 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 0.01778279410038923, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911964480070908}	1.1856	0.6153	Fri Apr 25 03:06:20 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 0.01878279410038923, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911964560751593}	1.1809	0.6098	Fri Apr 25 03:06:20 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0.2249365300761397, 'booster': 'gbtree', 'reg_lambda': 0, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911968446197632}	1.2241	0.6153	Fri Apr 25 03:06:20 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0.2249765300761397, 'booster': 'gbtree', 'reg_lambda': 0, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911968446197632}	1.1658	0.6098	Fri Apr 25 03:06:20 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 1, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911999162246485}	1.2465	0.6134	Fri Apr 25 03:06:20 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 1.001, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6911999174202924}	1.2102	0.6153	Fri Apr 25 03:06:20 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 5.623413251903491, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.6912154271501312}	1.1138	0.6098	Fri Apr 25 03:06:21 2025
Model Tuning	{1: 12222, 2: 12222, 3: 12222, 4: 12223, 5: 12223}	15	XGBClassifier	{'learning_rate': 0.0001, 'min_child_weight': 0, 'max_depth': 2, 'reg_alpha': 0, 'booster': 'gbtree', 'reg_lambda': 5.6244132519034915, 'n_estimators': 50, 'use_label_encoder': False}	-0.6912	{'neg_log_loss': -0.691215428004673}	1.2120	0.6153	Fri Apr 25 03:06:21 2025

	Feature	Attribution	Lower Bound	Upper Bound
0	capitalgain	0.071744	0.062587	0.080901
1	capitalloss	0.001740	-0.000836	0.004317
2	age	0.001172	-0.000865	0.003210
3	fnlwgt	0.000606	-0.001714	0.002927
4	hoursperweek	0.000125	-0.001286	0.001536
5	education-num	0.000085	-0.000435	0.000604
6	native-country	0.000000	0.000000	0.000000
7	race	0.000000	0.000000	0.000000
8	education	0.000000	0.000000	0.000000
9	sex	0.000000	0.000000	0.000000
10	relationship	0.000000	0.000000	0.000000
11	workclass	0.000000	0.000000	0.000000
12	marital-status	0.000000	0.000000	0.000000
13	occupation	0.000000	0.000000	0.000000

Building and Explaining a Classifier using AutoMLx

Overview of this Notebook

Prerequisites

Business Use

Table of Contents

Setup

Load the Census Income dataset

AutoML

Setting the execution engine

Create an instance of Oracle AutoMLx

Train a model using AutoMLx

Analyze the AutoMLx optimization process

Algorithm Selection

Adaptive Sampling

Feature Selection

Hyperparameter Tuning

Confusion Matrix

Advanced AutoMLx Configuration

Use a custom validation set

Machine Learning Explainability

Initializing an MLExplainer

Model Explanations (Global Feature importance)

Computing the importance

Visualization

Feature Dependence Explanations (PDP + ICE)

Prediction Explanations (Local Feature Importance)

Aggregate Local Feature Importance

Interactive What-If Explainers

Counterfactual Explanations

Advanced Feature Importance Options

Configure prediction explanation

Include the effects of feature interactions (with Shapley feature importance)

Local feature importance with kernel_shap

Local feature importance using surrogate models (LIME+)

Explain the model or Explain the world

Explain the model with observational evaluator_type

Advanced Feature Dependence Options (ALE)

References


(34189, 14)
None
KFoldSplit(Shuffle=True, Seed=7, folds=5, stratify by=target)
neg_log_loss
XGBClassifier
{'learning_rate': 0.10242113515453982, 'min_child_weight': 2, 'max_depth': 4, 'reg_alpha': 0.0007113117640155693, 'booster': 'gbtree', 'reg_lambda': 1.001, 'n_estimators': 141, 'use_label_encoder': False}
25.2.1
3.9.21 (main, Dec 11 2024, 16:24:11) \n[GCC 11.2.0]