Skip to content

Evaluation Metrics

omnirec.runner.evaluation.Evaluator(*metrics: Metric)

Initialize the Evaluator with metrics to compute on predictions. The Evaluator computes specified metrics on algorithm predictions and accumulates results across experiments. Use get_tables() to retrieve formatted result tables.

Parameters:

Name Type Description Default
*metrics Metric

One or more metric instances to compute. Common metrics include NDCG, HR (Hit Rate), and Recall. Each metric can be configured with multiple k values (e.g., NDCG([5, 10, 20])).

()
Source code in src\omnirec\runner\evaluation.py
def __init__(self, *metrics: Metric) -> None:
    """Initialize the Evaluator with metrics to compute on predictions.

    The Evaluator computes specified metrics on algorithm predictions and accumulates
    results across experiments. Use `get_tables()` to retrieve formatted result tables.

    Args:
        *metrics (Metric): One or more metric instances to compute. Common metrics include
            NDCG, HR (Hit Rate), and Recall. Each metric can be configured with multiple
            k values (e.g., `NDCG([5, 10, 20])`).
    """
    # Variadic positional arguments always arrive as a tuple, so they can be
    # stored directly; no iterable check or list wrapping is required.
    self._metrics = metrics
    # Per-dataset result tables keyed by dataset identifier; starts out empty.
    self._results: dict[str, DataFrame] = {}

get_results() -> dict[str, DataFrame]

Return evaluation results grouped by dataset.

Returns:

Type Description
dict[str, DataFrame]

dict[str, DataFrame]: Mapping of dataset identifiers to their result tables. Keys are dataset names with a unique hash appended. Each value is a DataFrame containing the columns:

  • "algorithm": algorithm identifier (name with config hash appended)
  • "fold": cross-validation fold index, or None if not using CV
  • "name": metric name
  • "k": cutoff for ranking metrics (e.g., NDCG@k), or None for non-ranking metrics (e.g., RMSE)
  • "value": metric value
Source code in src\omnirec\runner\evaluation.py
def get_results(self) -> dict[str, DataFrame]:
    """Expose the accumulated evaluation results, one table per dataset.

    Returns:
        dict[str, DataFrame]:
            The evaluator's internal results mapping itself (not a copy). Each key
            is a dataset name with a unique hash appended, and each value is a
            DataFrame with these columns:

            - "algorithm": algorithm identifier (name with config hash appended)
            - "fold": cross-validation fold index, or None if not using CV
            - "name": metric name
            - "k": cutoff for ranking metrics (e.g., NDCG@k), or None for non-ranking metrics (e.g., RMSE)
            - "value": metric value
    """
    return self._results

get_tables() -> list[Table]

Return evaluation results as formatted Rich tables, one per dataset.

Each table has one row per algorithm (and per fold when cross-validation is used) and one column per metric+k combination (e.g. NDCG@10). The tables are automatically printed to the console by run_omnirec() (omnirec.util.run.run_omnirec) after all experiments complete, so you only need to call this method directly if you want to redisplay results (e.g. after load_results()).

Returns:

Type Description
list[Table]

list[rich.table.Table]: One Rich Table per dataset.

Example
from rich.console import Console
console = Console()
for table in evaluator.get_tables():
    console.print(table)
Source code in src\omnirec\runner\evaluation.py
def get_tables(self) -> list[Table]:
    """Return evaluation results as formatted Rich tables, one per dataset.

    Each table has one row per algorithm (and per fold when cross-validation is used)
    and one column per metric+k combination (e.g. ``NDCG@10``). The tables are
    automatically printed to the console by :func:`~omnirec.util.run.run_omnirec`
    after all experiments complete, so you only need to call this method directly
    if you want to redisplay results (e.g. after :meth:`load_results`).

    Metrics without a cutoff keep their plain name as column header. Cutoffs that
    pandas stores as whole-number floats are shown as integers, so a column is
    labelled ``NDCG@10`` rather than ``NDCG@10.0``.

    Returns:
        list[rich.table.Table]: One Rich ``Table`` per dataset.

    Example:
        ```python
        from rich.console import Console
        console = Console()
        for table in evaluator.get_tables():
            console.print(table)
        ```
    """

    def _metric_label(row) -> str:
        """Build the column header for one result row, e.g. ``NDCG@10``."""
        cutoff = row["k"]
        # Non-ranking metrics carry no cutoff; their name is used unchanged.
        if pd.isna(cutoff):
            return row["name"]
        # An integer k column that also contains missing values is upcast to
        # float by pandas (e.g. when ranking and non-ranking metrics share a
        # table, or after a JSON round-trip); undo that for display.
        if isinstance(cutoff, float) and cutoff.is_integer():
            cutoff = int(cutoff)
        return f"{row['name']}@{cutoff}"

    tables: list[Table] = []

    for dataset, results_df in self._results.items():
        # Work on a copy so the stored results keep their raw metric names.
        df = results_df.copy()
        df["name"] = df.apply(_metric_label, axis=1)

        # Only keep fold if it actually varies
        if df["fold"].nunique() > 1:
            index_cols = ["algorithm", "fold"]
        else:
            index_cols = ["algorithm"]

        # One row per algorithm (and fold), one column per metric label.
        df_pivot = (
            df.pivot(index=index_cols, columns="name", values="value")
            .reset_index()
            .sort_values(index_cols)
        )

        table = Table(title=f"{dataset}: Evaluation Results")
        for col in df_pivot.columns:
            col_str = str(col)
            if col_str in ["algorithm", "fold"]:
                col_str = col_str.capitalize()
            table.add_column(col_str, style=ColumnStyles.get(col))

        # Repeated algorithm/fold labels are blanked so consecutive rows read
        # as a visual group.
        prev_alg, prev_fold = None, None
        for _, row in df_pivot.iterrows():
            alg = row["algorithm"] if row["algorithm"] != prev_alg else ""
            row_values = [alg]
            if "fold" in row:
                # Always show the fold on the first row of a new algorithm.
                fold = (
                    str(row["fold"])
                    if row["fold"] != prev_fold or alg != ""
                    else ""
                )
                row_values.append(fold)
                prev_fold = row["fold"]
                start_col = 2
            else:
                start_col = 1
            prev_alg = row["algorithm"]
            row_values.extend(
                [str(row[col]) for col in df_pivot.columns[start_col:]]
            )
            table.add_row(*row_values)

        tables.append(table)

    return tables

save_results(path: Path)

Persist evaluation results to a JSON file.

Serialises the internal results dictionary to JSON so that results can be reloaded later with load_results() without re-running experiments.

Parameters:

Name Type Description Default
path Path

Destination file path. The file is created or overwritten.

required
Example
from pathlib import Path
evaluator.save_results(Path("results/my_experiment.json"))
Source code in src\omnirec\runner\evaluation.py
def save_results(self, path: Path):
    """Persist evaluation results to a JSON file.

    Serialises the internal results dictionary to JSON so that results can be
    reloaded later with :meth:`load_results` without re-running experiments.
    Each dataset's DataFrame is stored as a list of row records.

    Args:
        path (Path): Destination file path. The file is created or overwritten,
            and missing parent directories are created first.

    Example:
        ```python
        from pathlib import Path
        evaluator.save_results(Path("results/my_experiment.json"))
        ```
    """
    # Serialise before touching the filesystem so a failure here leaves no
    # half-created output behind.
    data = {
        dataset: frame.to_dict("records") for dataset, frame in self._results.items()
    }
    payload = json.dumps(data)
    # The documented usage writes into a sub-directory that may not exist yet.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload, encoding="utf-8")

load_results(path: Path)

Load previously saved evaluation results from a JSON file.

Restores results that were written by save_results(). After loading, get_results() and get_tables() work as if the experiments had just finished.

Parameters:

Name Type Description Default
path Path

Path to a JSON file previously written by save_results().

required
Example
from pathlib import Path
from rich.console import Console

evaluator.load_results(Path("results/my_experiment.json"))

# Inspect raw DataFrames
for dataset_id, df in evaluator.get_results().items():
    print(df)

# Or redisplay the formatted tables
console = Console()
for table in evaluator.get_tables():
    console.print(table)
Source code in src\omnirec\runner\evaluation.py
def load_results(self, path: Path):
    """Read evaluation results back from a JSON file.

    Rebuilds one DataFrame per dataset from the records written by
    :meth:`save_results` and replaces whatever results the evaluator currently
    holds. Afterwards :meth:`get_results` and :meth:`get_tables` behave as if
    the experiments had just finished.

    Args:
        path (Path): Path to a JSON file previously written by :meth:`save_results`.

    Example:
        ```python
        from pathlib import Path
        from rich.console import Console

        evaluator.load_results(Path("results/my_experiment.json"))

        # Inspect raw DataFrames
        for dataset_id, df in evaluator.get_results().items():
            print(df)

        # Or redisplay the formatted tables
        console = Console()
        for table in evaluator.get_tables():
            console.print(table)
        ```
    """
    records_by_dataset = json.loads(path.read_text())
    restored = {}
    for dataset_id, records in records_by_dataset.items():
        restored[dataset_id] = pd.DataFrame(records)
    # Swap in the complete mapping only after every dataset was rebuilt.
    self._results = restored

Metric Base Classes

omnirec.metrics.base.Metric

Bases: ABC

omnirec.metrics.base.Metric.calculate(predictions: pd.DataFrame, test: pd.DataFrame) -> MetricResult abstractmethod

Source code in src\omnirec\metrics\base.py
@abstractmethod
def calculate(
    self, predictions: pd.DataFrame, test: pd.DataFrame
) -> MetricResult:
    """Compute this metric for a set of predictions.

    Abstract: every concrete metric must override this method.

    Args:
        predictions (pd.DataFrame): The predictions to score.
        test (pd.DataFrame): The ground-truth data the predictions are compared against.

    Returns:
        MetricResult: The metric name together with its computed value(s).
    """
    ...

omnirec.metrics.base.MetricResult(name: str, result: float | dict[int, float]) dataclass

Represents the result of a metric calculation. It holds the name as str and either a single float result or a dictionary of results for multiple k values.

Ranking Metrics

omnirec.metrics.ranking.HR(k: int | list[int])

Bases: RankingMetric

Computes the HR metric. k is the number of top recommendations to consider. It can be a single integer or a list of integers, in which case the metric will be computed for each value of k.

It follows the formula:

\(HR@k = \frac{1}{|U|} \sum_{u \in U} \mathbf{1}\{\text{Rel}(u) \cap \text{Pred}_k(u) \neq \emptyset\}\)

where \(\text{Pred}_k(u)\) is the set of top-k predicted items for user u.

Parameters:

Name Type Description Default
k int | list[int]

The number of top recommendations to consider.

required
Source code in src\omnirec\metrics\ranking.py
def __init__(self, k: int | list[int]) -> None:
    """Initializes the Hit Rate (HR) metric.

    k is the number of top recommendations to consider. It can be a single integer
    or a list of integers, in which case the metric will be computed for each value of k.

    HR@k is the fraction of users with at least one relevant item among their
    top-k recommendations:

    $HR@k = \\frac{1}{|U|} \\sum_{u \\in U} \\mathbf{1}\\{\\text{Rel}(u) \\cap \\text{Pred}_k(u) \\neq \\emptyset\\}$

    where $\\text{Rel}(u)$ is the set of relevant items for user u and
    $\\text{Pred}_k(u)$ is the set of top-k predicted items for user u.

    Args:
        k (int | list[int]): The number of top recommendations to consider.
    """
    super().__init__(k)

calculate(predictions: DataFrame, test: DataFrame) -> MetricResult

Calculates the Hit Rate (HR) metric. Considers the top-k predictions for one or multiple k values.

Parameters:

Name Type Description Default
predictions DataFrame

Contains the top k predictions for one or more users.

required
test DataFrame

Contains the ground truth relevant items for one or more users.

required

Returns:

Name Type Description
MetricResult MetricResult

The computed HR scores for each value k. If multiple users are provided, the scores are averaged.

Source code in src\omnirec\metrics\ranking.py
def calculate(self, predictions: DataFrame, test: DataFrame) -> MetricResult:
    """Calculates the Hit Rate (HR) metric. Considers the top-k predictions for one or multiple k values.

    A user counts as a hit at cutoff k when at least one of their first k
    predicted items appears among their items in ``test``. Users without any
    items in ``test`` therefore count as a miss.

    Args:
        predictions (DataFrame): Contains the top k predictions for one or more users.
        test (DataFrame): Contains the ground truth relevant items for one or more users.
            Read through its "user" and "item" columns.

    Returns:
        MetricResult: The computed HR scores for each value k. If multiple users are provided, the scores are averaged.
    """
    top_k_dict = self.make_topk_dict(predictions)
    # Largest cutoff requested; one membership test per user covers all smaller k.
    max_k = max(self._k_list)

    hr_per_user_per_k: dict[int, list[int]] = {}
    # FIXME: Fix metric implementation, adapt to new data format
    for user, (pred, _) in top_k_dict.items():
        positive_test_interactions = test["item"][test["user"] == user].to_numpy()
        hits = np.isin(pred[:max_k], positive_test_interactions)
        for k in self._k_list:
            # 1 if any of the first k predictions is relevant, else 0.
            hr_per_user_per_k.setdefault(k, []).append(int(hits[:k].any()))
    # Key each mean by the k it was accumulated under, rather than pairing by
    # position with _k_list, so repeated cutoffs cannot shift the labels.
    scores_dict = {
        k: sum(user_hits) / len(user_hits)
        for k, user_hits in hr_per_user_per_k.items()
    }
    return MetricResult(__class__.__name__, scores_dict)

omnirec.metrics.ranking.NDCG(k: int | list[int])

Bases: RankingMetric

Initializes the NDCG (Normalized Discounted Cumulative Gain) metric. k is the number of top predictions to consider. It can be a single integer or a list of integers, in which case the metric will be computed for each value of k.

The NDCG considers the position of relevant items in a ranked list of predictions.

For a user u, the discounted cumulative gain at cutoff k is

\(DCG@k(u) = \sum_{i=1}^{k} \frac{\mathbf{1}\{\text{pred}_i \in \text{Rel}(u)\}}{\log_2(i+1)}\)

where \(\mathbf{1}\{\cdot\}\) is the indicator function and

\(\text{Rel}(u)\) is the set of relevant items for user u.

The ideal discounted cumulative gain is

\(IDCG@k = \sum_{i=1}^{k} \frac{1}{\log_2(i+1)}\)

The normalized score is

\(NDCG@k(u) = \frac{DCG@k(u)}{IDCG@k}\)

Finally, the reported score is averaged over all users:

\(\text{NDCG@k} = \frac{1}{|U|} \sum_{u \in U} NDCG@k(u)\)

Parameters:

Name Type Description Default
k int | list[int]

The number of top predictions to consider.

required
Source code in src\omnirec\metrics\ranking.py
def __init__(self, k: int | list[int]) -> None:
    """Initializes the NDCG (Normalized Discounted Cumulative Gain) metric.

    k is the number of top predictions to consider. It can be a single integer or a
    list of integers, in which case the metric will be computed for each value of k.

    The NDCG considers the position of relevant items in a ranked list of predictions.

    For a user u, the discounted cumulative gain at cutoff k is

    $DCG@k(u) = \\sum_{i=1}^{k} \\frac{\\mathbf{1}\\{\\text{pred}_i \\in \\text{Rel}(u)\\}}{\\log_2(i+1)}$

    where $\\mathbf{1}\\{\\cdot\\}$ is the indicator function, $\\text{pred}_i$ is the
    item predicted at rank i, and $\\text{Rel}(u)$ is the set of relevant items for user u.

    The ideal discounted cumulative gain assumes a relevant item at every one of
    the k ranks:

    $IDCG@k = \\sum_{i=1}^{k} \\frac{1}{\\log_2(i+1)}$

    The normalized score is

    $NDCG@k(u) = \\frac{DCG@k(u)}{IDCG@k}$

    Finally, the reported score is averaged over all users:

    $\\text{NDCG@k} = \\frac{1}{|U|} \\sum_{u \\in U} NDCG@k(u)$

    Args:
        k (int | list[int]): The number of top predictions to consider.
    """
    super().__init__(k)

calculate(predictions: DataFrame, test: DataFrame) -> MetricResult

Computes the Normalized Discounted Cumulative Gain (NDCG). Considers the top-k predictions for one or multiple k values.

Parameters:

Name Type Description Default
predictions DataFrame

Contains the top k predictions for one or more users.

required
test DataFrame

Contains the ground truth relevant items for one or more users.

required

Returns:

Name Type Description
MetricResult MetricResult

The computed NDCG scores for each value k. If multiple users are provided, the scores are averaged.

Source code in src\omnirec\metrics\ranking.py
def calculate(self, predictions: DataFrame, test: DataFrame) -> MetricResult:
    """Computes the Normalized Discounted Cumulative Gain (NDCG). Considers the top-k predictions for one or multiple k values.

    Each relevant item found at rank i contributes 1 / log2(i + 1). The sum over
    the first k ranks is divided by the ideal value for k ranks, i.e. the sum of
    all k discounts. Users without any items in ``test`` score 0.

    Args:
        predictions (DataFrame): Contains the top k predictions for one or more users.
        test (DataFrame): Contains the ground truth relevant items for one or more users.
            Read through its "user" and "item" columns.

    Returns:
        MetricResult: The computed NDCG scores for each value k. If multiple users are provided, the scores are averaged.
    """
    top_k_dict = self.make_topk_dict(predictions)
    max_k = max(self._k_list)

    # Discount for ranks 1..max_k: 1 / log2(rank + 1).
    discounted_gain_per_k = 1 / np.log2(np.arange(2, max_k + 2))
    # IDCG@k is the running sum of the discounts; entry k - 1 holds IDCG@k.
    ideal_discounted_gain_per_k = np.cumsum(discounted_gain_per_k)

    ndcg_per_user_per_k: dict[int, list] = {}
    for user, (pred, _) in top_k_dict.items():
        positive_test_interactions = test["item"][test["user"] == user].to_numpy()
        hits = np.isin(pred[:max_k], positive_test_interactions)
        # A prediction list may be shorter than max_k, so align the discounts to it.
        user_dcg = np.where(hits, discounted_gain_per_k[: len(hits)], 0)
        for k in self._k_list:
            user_ndcg = user_dcg[:k].sum() / ideal_discounted_gain_per_k[k - 1]
            ndcg_per_user_per_k.setdefault(k, []).append(user_ndcg)

    # Key each mean by the k it was accumulated under, rather than pairing by
    # position with _k_list, so repeated cutoffs cannot shift the labels.
    scores_dict = {
        k: float(sum(user_scores)) / len(user_scores)
        for k, user_scores in ndcg_per_user_per_k.items()
    }
    return MetricResult(__class__.__name__, scores_dict)

omnirec.metrics.ranking.Recall(k: int | list[int])

Bases: RankingMetric

Calculates the average recall at k for one or multiple k values. Recall at k is defined as the proportion of relevant items that are found in the top-k recommendations.

It follows the formula:

\(Recall@k = \frac{1}{|U|} \sum_{u \in U} \frac{|\text{Rel}(u) \cap \text{Pred}_k(u)|}{\min(|\text{Rel}(u)|, k)}\)

where \(\text{Pred}_k(u)\) is the set of top-k predicted items for user u.

Parameters:

Name Type Description Default
k int | list[int]

The number of top recommendations to consider.

required
Source code in src\omnirec\metrics\ranking.py
def __init__(self, k: int | list[int]) -> None:
    """Initializes the Recall metric.

    Calculates the average recall at k for one or multiple k values. Recall at k is
    defined as the proportion of relevant items that are found in the top-k recommendations.

    It follows the formula:

    $Recall@k = \\frac{1}{|U|} \\sum_{u \\in U} \\frac{|\\text{Rel}(u) \\cap \\text{Pred}_k(u)|}{\\min(|\\text{Rel}(u)|, k)}$

    where $\\text{Rel}(u)$ is the set of relevant items for user u and
    $\\text{Pred}_k(u)$ is the set of top-k predicted items for user u. The
    denominator is capped at k, so a top-k list made up entirely of relevant
    items scores 1 even when the user has more than k relevant items.

    Args:
        k (int | list[int]): The number of top recommendations to consider.
    """
    super().__init__(k)

calculate(predictions: DataFrame, test: DataFrame) -> MetricResult

Calculates the Recall metric. Considers the top-k predictions for one or multiple k values.

Parameters:

Name Type Description Default
predictions DataFrame

Contains the top k predictions for one or more users.

required
test DataFrame

Contains the ground truth relevant items for one or more users.

required

Returns:

Type Description
MetricResult

MetricResult: The computed Recall scores for each value k. If multiple users are provided, the scores are averaged.

Source code in src\omnirec\metrics\ranking.py
def calculate(self, predictions: DataFrame, test: DataFrame) -> MetricResult:
    """Calculates the Recall metric. Considers the top-k predictions for one or multiple k values.

    For each user, the number of relevant items among the first k predictions is
    divided by ``min(number of relevant items, k)``. A user without any items in
    ``test`` scores 0.0 instead of producing an undefined 0/0 value, which would
    otherwise turn the averaged score into NaN.

    Args:
        predictions (DataFrame): Contains the top k predictions for one or more users.
        test (DataFrame): Contains the ground truth relevant items for one or more users.
            Read through its "user" and "item" columns.

    Returns:
        MetricResult: The computed Recall scores for each value k. If multiple users are provided, the scores are averaged.
    """
    top_k_dict = self.make_topk_dict(predictions)
    # Largest cutoff requested; one membership test per user covers all smaller k.
    max_k = max(self._k_list)

    recall_per_user_per_k: dict[int, list] = {}
    for user, (pred, _) in top_k_dict.items():
        positive_test_interactions = test["item"][test["user"] == user].to_numpy()
        relevant_count = len(positive_test_interactions)
        hits = np.isin(pred[:max_k], positive_test_interactions)
        for k in self._k_list:
            denominator = min(relevant_count, k)
            # Nothing to retrieve means nothing was recalled; avoid dividing by zero.
            user_recall = hits[:k].sum() / denominator if denominator else 0.0
            recall_per_user_per_k.setdefault(k, []).append(user_recall)
    # Key each mean by the k it was accumulated under, rather than pairing by
    # position with _k_list, so repeated cutoffs cannot shift the labels.
    scores_dict = {
        k: float(sum(user_scores)) / len(user_scores)
        for k, user_scores in recall_per_user_per_k.items()
    }
    return MetricResult(__class__.__name__, scores_dict)

Prediction Metrics

omnirec.metrics.prediction.MAE()

Bases: PredictionMetric

Mean Absolute Error (MAE) metric. Calculates the average of the absolute differences between predicted and actual ratings, according to the formula: \(MAE = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i|\)

Source code in src\omnirec\metrics\prediction.py
def __init__(self) -> None:
    """Initializes the Mean Absolute Error (MAE) metric.

    Calculates the average of the absolute differences between predicted and
    actual ratings, according to the formula:

    $MAE = \\frac{1}{n} \\sum_{i=1}^{n} |y_i - \\hat{y}_i|$

    where $y_i$ is an actual rating, $\\hat{y}_i$ the corresponding predicted
    rating, and n the number of rating pairs compared.
    """
    super().__init__()

calculate(predictions: DataFrame, test: DataFrame) -> MetricResult

Calculates the MAE metric.

Parameters:

Name Type Description Default
predictions DataFrame

Contains the predicted ratings.

required
test DataFrame

Contains the ground truth ratings.

required

Returns:

Name Type Description
MetricResult MetricResult

Contains the name of the metric and the computed MAE value.

Source code in src\omnirec\metrics\prediction.py
def calculate(self, predictions: DataFrame, test: DataFrame) -> MetricResult:
    """Compute the Mean Absolute Error between predicted and true ratings.

    Predictions and test data are combined through ``self.merge``; the error is
    then taken between the resulting ``rating_test`` and ``rating_pred`` columns.

    Args:
        predictions (DataFrame): Contains the predicted ratings.
        test (DataFrame): Contains the ground truth ratings.

    Returns:
        MetricResult: Contains the name of the metric and the computed MAE value.
    """
    paired = self.merge(predictions, test)
    actual = paired["rating_test"]
    predicted = paired["rating_pred"]
    error = mean_absolute_error(actual, predicted)
    return MetricResult(__class__.__name__, error)

omnirec.metrics.prediction.RMSE()

Bases: PredictionMetric

Root Mean Squared Error (RMSE) metric. Calculates the square root of the average of the squared differences between predicted and actual ratings, according to the formula:

\(RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2}\)

Source code in src\omnirec\metrics\prediction.py
def __init__(self) -> None:
    """Initializes the Root Mean Squared Error (RMSE) metric.

    Calculates the square root of the average of the squared differences between
    predicted and actual ratings, according to the formula:

    $RMSE = \\sqrt{\\frac{1}{n} \\sum_{i=1}^{n} (y_i - \\hat{y}_i)^2}$

    where $y_i$ is an actual rating, $\\hat{y}_i$ the corresponding predicted
    rating, and n the number of rating pairs compared.
    """
    super().__init__()

calculate(predictions: DataFrame, test: DataFrame) -> MetricResult

Calculate the RMSE metric.

Parameters:

Name Type Description Default
predictions DataFrame

Contains the predicted ratings.

required
test DataFrame

Contains the ground truth ratings.

required

Returns:

Name Type Description
MetricResult MetricResult

Contains the name of the metric and the computed RMSE value.

Source code in src\omnirec\metrics\prediction.py
def calculate(self, predictions: DataFrame, test: DataFrame) -> MetricResult:
    """Compute the Root Mean Squared Error between predicted and true ratings.

    Predictions and test data are combined through ``self.merge``; the error is
    then taken between the resulting ``rating_test`` and ``rating_pred`` columns.

    Args:
        predictions (DataFrame): Contains the predicted ratings.
        test (DataFrame): Contains the ground truth ratings.

    Returns:
        MetricResult: Contains the name of the metric and the computed RMSE value.
    """
    paired = self.merge(predictions, test)
    actual = paired["rating_test"]
    predicted = paired["rating_pred"]
    error = root_mean_squared_error(actual, predicted)
    return MetricResult(__class__.__name__, error)