Skip to content

Dataset Management

omnirec.recsys_data_set.RecSysDataSet(data: Optional[T] = None, meta: Optional[DatasetMeta] = None)

Bases: Generic[T]

Source code in src\omnirec\recsys_data_set.py
def __init__(
    self, data: Optional[T] = None, meta: Optional[DatasetMeta] = None
) -> None:
    """Create a dataset wrapper around an optional payload and metadata.

    Args:
        data: The payload to store. When ``None``, the ``_data`` attribute
            is left unset.
        meta: Metadata describing the dataset. A fresh ``DatasetMeta()`` is
            created when omitted.
    """
    self._lineage = []

    # Test for presence, not truthiness: a payload that evaluates as falsy
    # (e.g. an empty container) or that refuses truth-testing altogether
    # must still be stored when the caller passed it explicitly.
    if data is not None:
        self._data = data

    self._meta = DatasetMeta() if meta is None else meta

meta: DatasetMeta property

Return a shallow copy of the dataset metadata.

lineage: tuple[Trace, ...] property

Return the recorded preprocessing lineage as a read-only snapshot.

format_lineage(details: bool = False) -> str

Render the dataset lineage in either compact or detailed form.

Source code in src\omnirec\recsys_data_set.py
def format_lineage(self, details: bool = False) -> str:
    """Return the recorded lineage as text.

    Args:
        details: When False (default), emit one numbered line per step using
            each trace's ``repr``. When True, emit a ``Step N`` header
            followed by the trace's ``format_details()`` output, with steps
            separated by a blank line.

    Returns:
        The formatted lineage, or a fixed placeholder sentence when no
        steps have been recorded.
    """
    if not self._lineage:
        return "No preprocessing lineage recorded."

    numbered_steps = enumerate(self._lineage, start=1)

    if details:
        sections = []
        for position, step in numbered_steps:
            sections.append("\n".join([f"Step {position}", step.format_details()]))
        return "\n\n".join(sections)

    compact_rows = [f"{position}. {step!r}" for position, step in numbered_steps]
    return "\n".join(compact_rows)

format_details(include_lineage: bool = True, lineage_details: bool = False) -> str

Render a human-readable summary of the dataset and its provenance.

Source code in src\omnirec\recsys_data_set.py
def format_details(
    self, include_lineage: bool = True, lineage_details: bool = False
) -> str:
    """Build a multi-line, human-readable report for this dataset.

    The report starts with the dataset name, followed by the variant,
    interaction and column summaries, the indented metadata details, and the
    number of lineage steps.

    Args:
        include_lineage: When True, append a ``Lineage:`` section holding
            the output of ``format_lineage``.
        lineage_details: Forwarded to ``format_lineage`` as ``details``.

    Returns:
        The report as a single newline-joined string.
    """
    nested_indent = "    "
    report = [f"RecSysDataSet: {self._meta.name}"]

    # Header fields are produced in a fixed order; each getter is invoked
    # immediately before its value is appended.
    header_fields = (
        ("Variant", self._data_variant_name),
        ("Interactions", self._interaction_summary),
        ("Columns", self._column_summary),
    )
    for label, compute_value in header_fields:
        self._append_field(report, label, compute_value())

    report.append("  Metadata:")
    for meta_row in self._meta.format_details().splitlines():
        report.append(f"{nested_indent}{meta_row}")

    self._append_field(report, "Lineage steps", len(self._lineage))

    if include_lineage:
        report.append("  Lineage:")
        lineage_text = self.format_lineage(details=lineage_details)
        for lineage_row in lineage_text.splitlines():
            report.append(f"{nested_indent}{lineage_row}")

    return "\n".join(report)

use_dataloader(data_set: DataSet | str, raw_dir: Optional[PathLike | str] = None, canon_path: Optional[PathLike | str] = None, force_download=False, force_canonicalize=False) -> RecSysDataSet[RawData] staticmethod

Loads a dataset using a registered DataLoader. If not already done, the data set is downloaded and canonicalized. Canonicalization means duplicates are dropped, identifiers are normalized and the data is saved in a standardized format.

Parameters:

Name Type Description Default
data_set DataSet | str

The name of the dataset from the DataSet enum. Must be a registered DataLoader name.

required
raw_dir Optional[PathLike | str]

Target directory where the raw data is stored. If not provided, the data is downloaded to the default raw data directory (_DATA_DIR).

None
canon_path Optional[PathLike | str]

Path where the canonicalized data should be saved. If not provided, the data is saved to the default canonicalized data directory (_DATA_DIR / "canon").

None
force_download bool

If True, forces re-downloading of the raw data even if it already exists. Defaults to False.

False
force_canonicalize bool

If True, forces re-canonicalization of the data even if a canonicalized file exists. Defaults to False.

False

Returns:

Type Description
RecSysDataSet[RawData]

RecSysDataSet[RawData]: The loaded dataset in canonicalized RawData format.

Example
# Load the MovieLens 100K dataset using the registered DataLoader
# Download the raw data to the default directory and save the canonicalized data to the default path
dataset = RecSysDataSet.use_dataloader(data_set=DataSet.MovieLens100K)
Source code in src\omnirec\recsys_data_set.py
@staticmethod
def use_dataloader(
    data_set: DataSet | str,
    raw_dir: Optional[PathLike | str] = None,  # TODO: Name that right
    canon_path: Optional[PathLike | str] = None,  # TODO: Name that right
    force_download: bool = False,
    force_canonicalize: bool = False,
) -> "RecSysDataSet[RawData]":
    """Loads a dataset using a registered DataLoader. If not already done, the data set is downloaded and canonicalized.
    Canonicalization means duplicates are dropped, identifiers are normalized and the data is saved in a standardized format.

    If a canonicalized file already exists at the resolved path and neither
    ``force_download`` nor ``force_canonicalize`` is set, that file is read
    directly and no loader is run.

    Args:
        data_set (DataSet | str): A member of the DataSet enum or the name of a registered DataLoader.
        raw_dir (Optional[PathLike | str], optional): Target directory where the raw data is stored. If not provided, the data is downloaded to the default raw data directory (_DATA_DIR).
        canon_path (Optional[PathLike | str], optional): Path where the canonicalized data should be saved. If not provided, the data is saved to the default canonicalized data directory (_DATA_DIR / "canon").
        force_download (bool, optional): If True, forces re-downloading of the raw data even if it already exists. Defaults to False.
        force_canonicalize (bool, optional): If True, forces re-canonicalization of the data even if a canonicalized file exists. Defaults to False.

    Returns:
        RecSysDataSet[RawData]: The loaded dataset in canonicalized RawData format.

    Example:
        ```Python
        # Load the MovieLens 100K dataset using the registered DataLoader
        # Download the raw data to the default directory and save the canonicalized data to the default path
        dataset = RecSysDataSet.use_dataloader(data_set=DataSet.MovieLens100K)
        ```
    """
    # Enum members carry the loader name in ``.value``; plain strings are
    # used as the name directly.
    if isinstance(data_set, DataSet):
        data_set_name = data_set.value
    else:
        data_set_name = data_set
    dataset = RecSysDataSet[RawData]()

    dataset._meta.name = data_set_name

    if canon_path:
        dataset._meta.canon_pth = Path(canon_path)
    else:
        canon_dir = get_data_dir() / "canon"
        canon_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): with_suffix replaces an existing suffix, so a name
        # containing "." would lose its tail here -- confirm that no
        # registered loader name contains a dot.
        dataset._meta.canon_pth = (canon_dir / data_set_name).with_suffix(".csv")

    # Fast path: reuse the cached canonical file unless either force flag
    # asks for the pipeline to be re-run.
    if dataset._meta.canon_pth.exists() and not (
        force_canonicalize or force_download
    ):
        logger.info(
            "Canonicalized data set already exists, skipping download and canonicalization."
        )
        dataset._data = RawData(pd.read_csv(dataset._meta.canon_pth))
        return dataset

    # Only override the metadata's raw directory when the caller supplied one.
    if raw_dir:
        dataset._meta.raw_dir = Path(raw_dir)

    dataset._data = RawData(
        registry._run_loader(data_set_name, force_download, dataset._meta.raw_dir)
    )
    dataset._canonicalize()
    return dataset

save(file: str | PathLike)

Saves the RecSysDataSet object to a file with the default suffix .rsds.

Parameters:

Name Type Description Default
file str | PathLike

The path where the file is saved.

required
Source code in src\omnirec\recsys_data_set.py
def save(self, file: str | PathLike):
    """Write this RecSysDataSet to disk, defaulting the extension to .rsds.

    Args:
        file (str | PathLike): Destination path. A path without any suffix
            gets ``.rsds`` appended; a path that already has a suffix is
            used unchanged.
    """
    from omnirec.rsds.dispatcher import save_dataset

    requested = Path(file)
    # Only suffix-less paths receive the default extension.
    destination = requested if requested.suffix else requested.with_suffix(".rsds")

    save_dataset(self, destination)

load(file: str | PathLike) -> RecSysDataSet[T] staticmethod

Loads a RecSysDataSet object from a file with the .rsds suffix.

Parameters:

Name Type Description Default
file str | PathLike

The path to the .rsds file.

required

Returns:

Type Description
RecSysDataSet[T]

RecSysDataSet[T]: The loaded RecSysDataSet object.

Source code in src\omnirec\recsys_data_set.py
@staticmethod
def load(file: str | PathLike) -> "RecSysDataSet[T]":
    """Loads a RecSysDataSet object from a file with the .rsds suffix.

    Mirrors the default-suffix rule of ``save``: if the given path has no
    suffix and does not exist, ``.rsds`` is appended before loading, so a
    dataset written with ``save("name")`` can be read back with
    ``load("name")``. Paths that exist, or that already have a suffix, are
    used unchanged.

    Args:
        file (str | PathLike): The path to the .rsds file.

    Returns:
        RecSysDataSet[T]: The loaded RecSysDataSet object.
    """
    from omnirec.rsds.dispatcher import load_dataset

    file = Path(file)
    # Fall back to the default extension only when the literal path is
    # absent, so existing suffix-less paths keep loading as before.
    if not file.suffix and not file.exists():
        file = file.with_suffix(".rsds")

    ds = load_dataset(file)
    return cast(RecSysDataSet[T], ds)