# Source code for simpleml.datasets.pandas.base

"""
Pandas Module for datasets

Inherit and extend for particular patterns
"""

# Module authorship metadata.
__author__ = "Elisha Yadgaran"

from itertools import chain
from typing import List, Optional, Union

import pandas as pd

from simpleml.datasets.base_dataset import Dataset
from simpleml.pipelines.validation_split_mixins import Split
from simpleml.utils.errors import DatasetError

# Name of the internal dataframe column that records which split a row belongs
# to. It is excluded from every column selection handed back to consumers.
DATAFRAME_SPLIT_COLUMN: str = "DATASET_SPLIT"


class BasePandasDataset(Dataset):
    """
    Pandas base class with control mechanism for `self.dataframe` of
    type `pd.DataFrame`

    Rows are optionally tagged with a split name in the internal
    `DATAFRAME_SPLIT_COLUMN` column; columns are grouped into named sections
    through `self.config["split_section_map"]`. Every column that is not part
    of a registered section (and is not the internal split column) is treated
    as part of "X".
    """

    def __init__(self, squeeze_return: bool = False, **kwargs):
        """
        :param squeeze_return: boolean flag whether to run dataframe.squeeze() on
            return from self.get() calls. Particularly necessary to align input
            types with different libraries (e.g. sklearn y with single label)
        :param kwargs: forwarded unchanged to the parent `Dataset` constructor
        """
        super().__init__(**kwargs)
        self.config["squeeze_return"] = squeeze_return

    @property
    def X(self) -> Union[pd.DataFrame, pd.Series]:
        """
        Return the subset that isn't in the target labels (across all potential splits)

        May be a `pd.Series` when `squeeze_return` is enabled and a single
        column remains.
        """
        return self.get(column="X", split=None)

    @property
    def y(self) -> Union[pd.DataFrame, pd.Series]:
        """
        Return the target label columns (across all potential splits)

        May be a `pd.Series` when `squeeze_return` is enabled and a single
        column remains.
        """
        return self.get(column="y", split=None)

    @property
    def _dataframe(self) -> pd.DataFrame:
        """
        Overwrite base behavior to return a copy of the data in case consumers
        attempt to mutate the data structure

        Only copies the pandas container - underlying cell objects can still propagate
        inplace mutations (eg lists, dicts, objects)
        """
        # return a copy so mutations can happen inplace with memory efficient objects
        return self._external_file.copy()

    @_dataframe.setter
    def _dataframe(self, df: pd.DataFrame) -> None:
        """
        Setter method for self._external_file
        Allows mixins/subclasses to validate input

        :param df: the dataframe to store as the underlying data
        """
        self._external_file = df

    def _validate_dtype(self, df: pd.DataFrame) -> None:
        """
        Validation helper for data assigned to self._external_file
        Checks input is of type pd.DataFrame

        :param df: the candidate object to validate
        :raises DatasetError: if `df` is not a `pd.DataFrame`
        """
        if not isinstance(df, pd.DataFrame):
            raise DatasetError("Pandas Datasets must be of type `pd.DataFrame`")

    def get(
        self, column: Optional[str], split: Optional[str]
    ) -> Union[pd.DataFrame, pd.Series]:
        """
        Explicitly split validation splits
        Uses the registered sections (`split_section_map` in the config) to
        separate X and the other sections inside the returned dataframe

        returns empty dataframe for missing combinations of column & split

        :param column: section to return. `None` returns every column except the
            internal split column, "X" returns every column that is not part of a
            registered section, anything else must be a registered section name
        :param split: split name to filter rows by. `None` returns all rows
        :raises ValueError: if `column` is not None, "X" or a registered section
        :return: the selected data; squeezed along the column axis when the
            `squeeze_return` config flag is set
        """
        registered_sections = self.config.get("split_section_map")
        squeeze_return = self.config.get("squeeze_return")

        if column is not None and column != "X" and column not in registered_sections:
            raise ValueError(
                f"Only support registered sections: {registered_sections}, X, or None"
            )

        dataframe = self.dataframe  # copy

        # choose the columns to slice from the dataframe
        if column is None:  # All except internal columns
            return_columns = [
                col for col in dataframe.columns if col != DATAFRAME_SPLIT_COLUMN
            ]

        elif column != "X":
            # other passthrough columns
            return_columns = registered_sections[column]

        else:  # X
            # Build the lookup once; X is whatever is left after removing every
            # registered section column and the internal split column.
            section_columns = set(chain.from_iterable(registered_sections.values()))
            return_columns = [
                col
                for col in dataframe.columns
                if col != DATAFRAME_SPLIT_COLUMN and col not in section_columns
            ]

        result = self._get(dataframe=dataframe, columns=return_columns, split=split)
        if squeeze_return:
            return self.squeeze_dataframe(result)
        return result

    @staticmethod
    def _get(
        dataframe: pd.DataFrame, columns: List[str], split: Optional[str]
    ) -> pd.DataFrame:
        """
        Internal method to extract data subsets from a dataframe

        NOTE: when `split` is None the extra columns are dropped inplace on the
        object that was passed in, so callers should hand over a frame they own
        (`get` passes `self.dataframe`, which it treats as a copy).

        :param dataframe: the dataframe to subset from
        :param columns: List of columns to slice from the dataframe
        :param split: row identifiers to slice rows (in internal column mapped to
            `DATAFRAME_SPLIT_COLUMN`). `None` keeps all rows
        :raises DatasetError: if a split is requested but the dataframe has no
            `DATAFRAME_SPLIT_COLUMN` column
        """
        if split is not None:  # Restrict rows to the requested split
            if DATAFRAME_SPLIT_COLUMN not in dataframe.columns:
                raise DatasetError(
                    f"Cannot retrieve dataset split `{split}` from dataframe without `{DATAFRAME_SPLIT_COLUMN}` column"
                )
            # Boolean mask instead of a string-built `query` expression so split
            # names containing quotes (or other special characters) cannot break
            # or alter the filter. The selection yields a new object.
            dataframe = dataframe[dataframe[DATAFRAME_SPLIT_COLUMN] == split]

        # inplace drop extra columns
        drop_columns = [col for col in dataframe.columns if col not in columns]
        if drop_columns:
            dataframe.drop(drop_columns, axis=1, inplace=True)

        # Last check in case any of the operations created a view or weakref copy
        if getattr(dataframe, "_is_view", False) or (
            getattr(dataframe, "_is_copy", None) is not None
        ):
            dataframe = dataframe.copy()

        return dataframe

    def get_split(self, split: Optional[str]) -> Split:
        """
        Wrapper accessor to return a split object (for internal use)

        :param split: split name to filter rows by. `None` returns all rows
        :return: a `Split` holding X plus one entry per registered section,
            after calling `.squeeze()` on it
        """
        registered_sections = self.config.get("split_section_map")
        return Split(
            # explicitly get X as the "other" columns
            X=self.get(column="X", split=split),
            # should include y and any others if they exist
            **{
                section: self.get(split=split, column=section)
                for section in registered_sections
            },
        ).squeeze()

    def get_split_names(self) -> List[str]:
        """
        Helper to expose the splits contained in the dataset

        :return: unique values of the `DATAFRAME_SPLIT_COLUMN` column, or an
            empty list when the dataframe has no such column
        """
        df = self.dataframe
        if DATAFRAME_SPLIT_COLUMN in df.columns:
            return df[DATAFRAME_SPLIT_COLUMN].unique().tolist()
        return []

    def get_feature_names(self) -> List[str]:
        """
        Should return a list of the features in the dataset

        :return: the column names of `self.X`
        """
        return self.X.columns.tolist()

    # ------------------------------------------------------------------
    # Generic Pandas Helper Utils
    # ------------------------------------------------------------------

    @staticmethod
    def concatenate_dataframes(
        dataframes: List[pd.DataFrame], split_names: List[str]
    ) -> pd.DataFrame:
        """
        Helper method to merge dataframes into a single one with the split
        specified under `DATAFRAME_SPLIT_COLUMN`

        The input dataframes are left untouched; the split column is added to
        copies. Dataframes and names are paired positionally; dataframes beyond
        the end of `split_names` are still concatenated, without a split label.

        :param dataframes: the dataframes to stack row-wise
        :param split_names: the split name to assign to each dataframe
        :return: a single dataframe with a fresh integer index
        """
        # `assign` returns a new frame, so the caller's objects are not mutated
        labelled = [
            df.assign(**{DATAFRAME_SPLIT_COLUMN: name})
            for df, name in zip(dataframes, split_names)
        ]
        # keep any unmatched trailing frames so the set of concatenated rows
        # does not depend on how many names were supplied
        labelled.extend(dataframes[len(labelled):])

        # Join row wise - drop index in case duplicates exist
        return pd.concat(labelled, axis=0, ignore_index=True)

    @staticmethod
    def merge_split(split: Split) -> pd.DataFrame:
        """
        Helper method to merge all dataframes in a split object into a single df
        does a column-wise join
        ex: `df1 = [A, B, C](4 rows)` + `df2 = [D, E, F](4 rows)`
        returns: `[A, B, C, D, E, F](4 rows)`

        :param split: mapping-like split object whose values are concatenated
        """
        return pd.concat(list(split.values()), axis=1)

    @staticmethod
    def squeeze_dataframe(df: pd.DataFrame) -> Union[pd.DataFrame, pd.Series]:
        """
        Helper method to run dataframe squeeze along the column axis

        :param df: the dataframe to squeeze
        :return: a series when `df` has a single column, otherwise the
            dataframe unchanged
        """
        return df.squeeze(axis=1)