import datetime
import logging
import os
import time
import urllib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from automlx import AutoRecommender, init

# Plot defaults shared by every chart below: figure size and base font size
plt.rcParams.update({"figure.figsize": [10, 7], "font.size": 15})

# Raise the threshold of the "sanerec.autotuning.parameter" logger to ERROR
# so that its lower-severity records are not shown
logging.getLogger("sanerec.autotuning.parameter").setLevel(level=logging.ERROR)

# Start the AutoMLx execution engine on ray, passing log_to_driver=False
# through to the ray setup options
init(engine="ray", engine_opts=dict(ray_setup=dict(log_to_driver=False)))

[2025-04-25 03:34:51,937] [automlx.backend] Overwriting ray session directory to /tmp/1frrc9a7/ray, which will be deleted at engine shutdown. If you wish to retain ray logs, provide _temp_dir in ray_setup dict of engine_opts when initializing the AutoMLx engine.


# Download the MovieLens 100k interaction file (u.data) to ./ml100k_interactions.tsv.
# This shells out to wget through IPython, so it only works inside an IPython/Jupyter
# session with wget installed. -q silences wget's output and -O sets the target path.
# NOTE(review): --no-check-certificate disables TLS certificate verification for this
# download — confirm this is acceptable in your environment.
get_ipython().system(' wget https://files.grouplens.org/datasets/movielens/ml-100k/u.data --no-check-certificate -q -O ./ml100k_interactions.tsv')


# Parse the tab-separated interaction file; it has no header row, so the
# column names are supplied explicitly.
dataset = pd.read_table(
    "./ml100k_interactions.tsv",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"],
)
# Keep a random half of the rows; the fixed seed makes the subset reproducible.
dataset = dataset.sample(frac=0.5, random_state=1)

dataset.head(5)


# Move the interaction timestamp out of the columns and into the dataframe index
dataset = dataset.set_index("timestamp")
dataset.head(5)


# Role of each column for the recommender: movie_id is tagged "recommendation"
# and user_id is tagged "recommendation_subject"
col_types = {"movie_id": "recommendation", "user_id": "recommendation_subject"}


# Split the interactions into a training part and a held-out test part.
# NOTE(review): the split strategy is decided inside AutoRecommender.train_test_split
# and is not visible here — confirm against the AutoMLx documentation.
training_data, test_data = AutoRecommender.train_test_split(data=dataset, col_types=col_types)


# Create a recommender pipeline; configure() is called without arguments,
# so no custom settings are applied at this point
automl_pipeline = AutoRecommender().configure()


# Run the AutoML search on the training data; the value returned by fit
# is rebound to the same name
automl_pipeline = automl_pipeline.fit(data=training_data, col_types=col_types)

[2025-04-25 03:35:07,292] [automlx.interface] Dataset shape: (49055,3)
[2025-04-25 03:35:07,362] [automlx.process] Running Model Generation
[2025-04-25 03:35:07,402] [automlx.process] Model Generation completed.
[2025-04-25 03:35:07,446] [automlx.model_selection] Running Model Selection
[2025-04-25 03:35:56,451] [automlx.model_selection] Model Selection completed - Took 49.005 sec - Selected models: [['ItemKNNRecommender']]
[2025-04-25 03:35:56,513] [automlx.trials] Running Model Tuning for ['ItemKNNRecommender']
[2025-04-25 03:36:12,913] [automlx.trials] Best parameters for ItemKNNRecommender: {'n_recommendations': 10, 'num_of_neighbors': 506, 'bias': 0.0001, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}
[2025-04-25 03:36:12,915] [automlx.trials] Model Tuning completed. Took: 16.401 secs
[2025-04-25 03:36:13,337] [automlx.interface] Re-fitting pipeline
[2025-04-25 03:36:13,344] [automlx.final_fit] Skipping updating parameter seed, already fixed by FinalFit_19cf1d36-6
[2025-04-25 03:36:14,240] [automlx.interface] AutoMLx completed.


# Pick one random row of the test set and keep only its user_id column.
# No random_state is given, so a different user may be drawn on every run.
recommendation_subjects = test_data.sample(1)[['user_id']]
# Request 5 recommendations for that user from the fitted pipeline
automl_pipeline.predict(subjects=recommendation_subjects, n_recommendations=5)


# Print the pipeline's summary of the completed AutoML run
automl_pipeline.print_summary()


def plot_model_selection_scores(_pipeline):
    """Draw a bar chart of the score each algorithm reached during model selection.

    The rows of ``_pipeline.completed_trials_summary_`` whose ``Step`` contains
    "Model Selection" are used. Rows with an infinite or missing score are
    ignored. Bars are sorted from best to worst score; the bar of the selected
    model is orange, bars at or above the mean score are teal and the remaining
    bars are turquoise. A thin horizontal line marks the mean score.

    Args:
        _pipeline: Fitted pipeline exposing ``completed_trials_summary_``
            (a dataframe with at least the ``Step``, ``Algorithm`` and score
            columns), ``_inferred_score_metric`` and ``selected_model_``.

    Raises:
        ValueError: If no model selection trial has a finite score.
    """
    summary = _pipeline.completed_trials_summary_
    name_of_score_column = f"Score ({_pipeline._inferred_score_metric[0].name})"

    # Build a fresh dataframe instead of mutating a filtered slice in place;
    # in-place edits on a slice trigger pandas' SettingWithCopyWarning.
    trials = (
        summary[summary["Step"].str.contains("Model Selection", na=False)]
        .replace([np.inf, -np.inf], np.nan)
        .dropna(subset=[name_of_score_column])
    )
    if trials.empty:
        raise ValueError("No model selection trial with a finite score to plot.")

    s = pd.Series(
        trials[name_of_score_column].tolist(),
        index=trials["Algorithm"].tolist(),
    ).sort_values(ascending=False)

    mean_score = s.mean()
    selected_model = _pipeline.selected_model_.strip()

    # Iterate over (label, value) pairs so that duplicated algorithm labels
    # cannot turn a label lookup into a multi-row result.
    colors = []
    for algorithm, value in s.items():
        if algorithm.strip() == selected_model:
            colors.append("orange")
        elif value >= mean_score:
            colors.append("teal")
        else:
            colors.append("turquoise")

    lowest, highest = s.min(), s.max()
    y_margin = 0.10 * (highest - lowest)
    if y_margin == 0:
        # Every score is identical (e.g. a single algorithm): derive the margin
        # from the value itself so the y-axis does not collapse to zero height.
        y_margin = 0.10 * abs(highest) or 0.1

    fig, ax = plt.subplots(1)
    ax.set_title("Algorithm Selection Trials")
    ax.set_ylim(lowest - y_margin, highest + y_margin)
    # NOTE: the axis label is fixed and does not follow the metric named in
    # the score column.
    ax.set_ylabel("Hit Rate")
    s.plot.bar(ax=ax, color=colors, edgecolor="black")
    ax.axhline(y=mean_score, color="black", linewidth=0.5)
    plt.show()

# Visualize the model selection scores of the pipeline fitted with default settings
plot_model_selection_scores(automl_pipeline)


def plot_hp_tuning_scores(_pipeline):
    """Plot the best score reached so far across the hyperparameter tuning trials.

    The rows of ``_pipeline.completed_trials_summary_`` whose ``Step`` contains
    "Model Tuning" are used. Rows with an infinite or missing score, and rows
    whose ``Finished`` value is ``-1``, are ignored. The remaining trials are
    ordered by their ``Finished`` timestamp and the running maximum of the
    score is drawn against the trial number.

    Args:
        _pipeline: Fitted pipeline exposing ``completed_trials_summary_``
            (a dataframe with at least the ``Step``, ``Finished`` and score
            columns) and ``_inferred_score_metric``.

    Raises:
        ValueError: If no tuning trial with a finite score and a finish time
            is available.
    """
    summary = _pipeline.completed_trials_summary_
    name_of_score_column = f"Score ({_pipeline._inferred_score_metric[0].name})"

    # Build a fresh dataframe instead of mutating a filtered slice in place;
    # in-place edits on a slice trigger pandas' SettingWithCopyWarning.
    trials = (
        summary[summary["Step"].str.contains("Model Tuning", na=False)]
        .replace([np.inf, -np.inf], np.nan)
        .dropna(subset=[name_of_score_column])
    )
    # A "Finished" value of -1 marks a trial without a finish timestamp.
    trials = trials[trials["Finished"] != -1]
    if trials.empty:
        raise ValueError("No finished model tuning trial with a finite score to plot.")

    # Timestamps look like "Fri Apr 25 03:36:10 2025". They have one-second
    # resolution, so ties are common; a stable sort keeps tied trials in their
    # original relative order and makes the curve reproducible.
    trials = trials.assign(
        Finished=pd.to_datetime(trials["Finished"], format="%a %b %d %H:%M:%S %Y")
    ).sort_values(by="Finished", ascending=True, kind="mergesort")

    # Running maximum: the best score observed up to and including each trial.
    best_so_far = trials[name_of_score_column].astype(float).cummax().tolist()

    lowest, highest = min(best_so_far), max(best_so_far)
    y_margin = 0.10 * (highest - lowest)
    if y_margin == 0:
        # The first trial was never beaten, so the curve is flat: derive the
        # margin from the value itself to keep a visible y-axis range.
        y_margin = 0.10 * abs(highest) or 0.1

    fig, ax = plt.subplots(1)
    ax.set_title("Hyperparameter Tuning Trials")
    ax.set_xlabel("Iteration $n$")
    # NOTE: the axis label is fixed and does not follow the metric named in
    # the score column.
    ax.set_ylabel("Hit Rate")
    ax.grid(color="g", linestyle="-", linewidth=0.1)
    ax.set_ylim(lowest - y_margin, highest + y_margin)
    # Line style and colour are passed as keywords only; combining a format
    # string that carries a colour with color= makes matplotlib warn.
    ax.plot(
        range(1, len(best_so_far) + 1),
        best_so_far,
        linestyle=":",
        marker="s",
        color="teal",
        markersize=3,
    )
    plt.show()

# Visualize the hyperparameter tuning progress of the pipeline fitted with default settings
plot_hp_tuning_scores(automl_pipeline)


# Configure a second pipeline with explicit settings instead of the defaults:
# a restricted model list, two tuned algorithms, a custom search range for
# ItemKNNRecommender, a cap on tuning trials and "recall" as the score metric.
custom_pipeline = AutoRecommender().configure(
    model_list=[  # Specify the models you want the AutoMLx to consider
        "ItemKNNRecommender",
        "AlsRecommender",
        "BprRecommender",
    ],
    n_algos_tuned=2,  # Choose how many models to tune
    search_space={  # You can specify the hyperparameters and ranges we search for each model
        "ItemKNNRecommender": {"num_of_neighbors": {"range": [10, 30], "type": "continuous"}}
    },
    max_tuning_trials=20,  # The maximum number of tuning trials. Can be integer or Dict (max number for each model)
    score_metric="recall",  # Any of the metrics available, see the documentation for a list of supported values
)


# Split the current training data once more to obtain a validation set.
# Note that training_data is rebound here to the new training part.
training_data, validation_data = AutoRecommender.train_test_split(data=training_data, col_types=col_types)


# Run the AutoML pipeline again, this time with the custom training/validation
# split created above and with advanced settings passed directly to fit.


# Positional arguments, in order: training data, column types, validation data
custom_pipeline = custom_pipeline.fit(
    training_data,
    col_types,
    validation_data,
    time_budget=20,  # Specify time budget in seconds
)

[2025-04-25 03:36:15,105] [automlx.interface] Dataset shape: (49055,3)
[2025-04-25 03:36:15,170] [automlx.process] Running Model Generation
[2025-04-25 03:36:15,215] [automlx.process] Model Generation completed.
[2025-04-25 03:36:15,246] [automlx.model_selection] Running Model Selection
[2025-04-25 03:36:16,875] [automlx.model_selection] Model Selection completed - Took 1.629 sec - Selected models: [['ItemKNNRecommender', 'AlsRecommender']]
[2025-04-25 03:36:16,953] [automlx.trials] Running Model Tuning for ['ItemKNNRecommender']
[2025-04-25 03:36:20,323] [automlx.trials] Best parameters for ItemKNNRecommender: {'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 0.010099998000000002, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}
[2025-04-25 03:36:20,324] [automlx.trials] Model Tuning completed. Took: 3.371 secs
[2025-04-25 03:36:20,456] [automlx.trials] Running Model Tuning for ['AlsRecommender']
[2025-04-25 03:36:23,155] [automlx.trials] Best parameters for AlsRecommender: {'n_recommendations': 10, 'iterations': 10, 'factors': 16, 'regularization': 0.00044721247746457157, 'cache_users_states': True}
[2025-04-25 03:36:23,156] [automlx.trials] Model Tuning completed. Took: 2.700 secs
[2025-04-25 03:36:23,552] [automlx.interface] Re-fitting pipeline
[2025-04-25 03:36:23,562] [automlx.final_fit] Skipping updating parameter seed, already fixed by FinalFit_22bd3002-a
[2025-04-25 03:36:25,050] [automlx.interface] AutoMLx completed.


# Request 5 recommendations from the custom pipeline for the user held in
# recommendation_subjects
custom_pipeline.recommend(subjects=recommendation_subjects, n_recommendations=5)


# IPython %precision magic (requires an IPython/Jupyter session); sets the
# float display precision to 4
get_ipython().run_line_magic('precision', '4')
# Score the custom pipeline on the held-out test data using the "ndcg" metric
custom_pipeline.score(data=test_data, score_metric="ndcg")

0.0300

	user_id	movie_id	rating	timestamp
43660	508	185	5	883777430
87278	518	742	5	876823804
14317	178	28	5	882826806
81932	899	291	4	884122279
95321	115	117	4	881171009

	user_id	movie_id	rating
timestamp
883777430	508	185	5
876823804	518	742	5
882826806	178	28	5
884122279	899	291	4
881171009	115	117	4

	user_id	movie_id	score
0	628	330	15.370814
1	628	286	15.029380
2	628	258	14.119169
3	628	272	13.703196
4	628	313	13.564656

Step	# Samples	# Features	Algorithm	Hyperparameters	Score (SanerecMetric)	All Metrics	Runtime (Seconds)	Memory Usage (GB)	Finished
Model Selection	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 100, 'bias': 25, 'hist_len': 20, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0882	{'hr': 0.08820403825717323}	1.0319	0.7063	Fri Apr 25 03:35:20 2025
Model Selection	48114	2	AlsRecommender	{'n_recommendations': 10, 'iterations': 10, 'factors': 16, 'regularization': 0.01, 'cache_users_states': True}	0.0691	{'hr': 0.06907545164718384}	4.5845	0.7055	Fri Apr 25 03:35:19 2025
Model Selection	48114	2	TRexxRecommender	{'n_recommendations': 10, 'embedding_dim': 32, 'sequence_length': 5, 'num_sampled': 100, 'dropout_rate': 0.2, 'num_blocks': 2, 'num_head': 4, 'l2_reg_embedding': 1e-06, 'dnn_activation': 'tanh', 'optimizer_name': 'lazyadam', 'optimizer_learning_rate': 0.01, 'future_blinding': False, 'embeddings_on_cpu': False, 'cache_users_states': False, 'negative_sampling_method': CandidateSamplingMethod.UNIFORM_CANDIDATE_SAMPLING, 'epochs': 10, 'batch_size': 512, 'verbose': 1, 'augment_data': True, 'early_stopping_patience': -1}	0.0531	{'hr': 0.053134962805526036}	35.1017	1.1860	Fri Apr 25 03:35:56 2025
Model Selection	48114	2	BprRecommender	{'n_recommendations': 10, 'iterations': 10, 'factors': 16, 'regularization': 0.01, 'cache_users_states': True}	0.0372	{'hr': 0.03719447396386823}	0.4148	0.7046	Fri Apr 25 03:35:21 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 505, 'bias': 0.0001, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0999	{'hr': 0.09989373007438895}	1.3955	0.6848	Fri Apr 25 03:36:10 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 506, 'bias': 0.0001, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0999	{'hr': 0.09989373007438895}	1.0910	0.6842	Fri Apr 25 03:36:12 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 506, 'bias': 0.0001, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0999	{'hr': 0.09989373007438895}	1.3582	0.6869	Fri Apr 25 03:36:09 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 28.25660795027468, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0956	{'hr': 0.09564293304994687}	1.4462	0.6762	Fri Apr 25 03:36:08 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 28.26160794927468, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0956	{'hr': 0.09564293304994687}	1.1324	0.6785	Fri Apr 25 03:36:10 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 28.26160794927468, 'hist_len': 10, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0956	{'hr': 0.09564293304994687}	1.3547	0.6758	Fri Apr 25 03:36:08 2025
...	...	...	...	...	...	...	...	...	...
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 0.0001, 'hist_len': 20, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.084	{'hr': 0.08395324123273114}	1.3191	0.6742	Fri Apr 25 03:36:08 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 0.0001, 'hist_len': 21, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.084	{'hr': 0.08395324123273114}	1.3325	0.6755	Fri Apr 25 03:36:08 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 505, 'bias': 25, 'hist_len': 20, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0829	{'hr': 0.08289054197662062}	1.1355	1.1758	Fri Apr 25 03:36:01 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 506, 'bias': 25, 'hist_len': 20, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0829	{'hr': 0.08289054197662062}	1.3934	1.1758	Fri Apr 25 03:36:02 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 752, 'bias': 25, 'hist_len': 20, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0818	{'hr': 0.0818278427205101}	1.4294	1.1812	Fri Apr 25 03:36:05 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 753, 'bias': 25, 'hist_len': 20, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0818	{'hr': 0.0818278427205101}	1.2525	1.1812	Fri Apr 25 03:36:04 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 0.0001, 'hist_len': 132, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0797	{'hr': 0.07970244420828905}	1.4449	0.6758	Fri Apr 25 03:36:09 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 0.0001, 'hist_len': 133, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0797	{'hr': 0.07970244420828905}	1.3044	0.6773	Fri Apr 25 03:36:08 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 0.0001, 'hist_len': 255, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0797	{'hr': 0.07970244420828905}	1.4471	0.6760	Fri Apr 25 03:36:09 2025
Model Tuning	48114	2	ItemKNNRecommender	{'n_recommendations': 10, 'num_of_neighbors': 10, 'bias': 0.0001, 'hist_len': 256, 'reciprocal_ranking': False, 'normalize_scores': False, 'cache_users_states': True}	0.0797	{'hr': 0.07970244420828905}	1.4831	0.6730	Fri Apr 25 03:36:09 2025

	user_id	movie_id	score
0	628	286	13.964525
1	628	330	13.761287
2	628	272	12.332191
3	628	331	12.210435
4	628	313	12.163628

Building a Recommender using AutoMLx

Overview of this Notebook

Prerequisites:

Business Use:

Table of Contents

Setup

Load Movielens 100k data

Define types of columns in the dataframe

Splitting the dataset

AutoML

Create an instance of Oracle AutoMLx

Train a model using AutoMLx

Generate recommendations

Analyze the AutoMLx optimization process

Algorithm Selection

Hyperparameter Tuning

Advanced AutoMLx Configuration

Use a custom validation set

Final evaluation of the best model