Source code for simpleml.datasets.numpy.base

"""
Numpy Module for external "dataframe"

Inherit and extend for particular patterns. It is a bit of a misnomer to use the
term "dataframe", since there are very few expected attributes and they are by no
means unique to pandas.
"""

# Module author metadata.
__author__ = "Elisha Yadgaran"
import logging
from typing import Any, List, Optional

import numpy as np

from simpleml.datasets.base_dataset import Dataset
from simpleml.pipelines.validation_split_mixins import Split
from simpleml.utils.errors import DatasetError
# Module-level logger, named after this module's import path.
LOGGER = logging.getLogger(__name__)
class BaseNumpyDataset(Dataset):
    """
    Assumes self.dataframe is a dictionary of numpy ndarrays

    Two container layouts are read by :meth:`get`:

    - flat: ``{"X": ..., <label key>: ...}`` (accessed with ``split=None``)
    - nested: ``{<split name>: {"X": ..., <label key>: ...}}``

    The label key is the first entry of ``self.label_columns`` and defaults
    to ``"y"`` when no label column is configured.
    """

    # TODO: rewrite class to index native numpy array directly
    # Considered unstable for the time being

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent class and warn about instability."""
        super().__init__(*args, **kwargs)
        LOGGER.warning(
            "Numpy datasets are currently unstable - usage is discouraged as breaking changes may be introduced"
        )

    @property
    def X(self) -> Optional[np.ndarray]:
        """
        Return the subset that isn't in the target labels

        Reads the "X" entry of the top-level container (no split applied);
        None if that entry is missing.
        """
        return self.get(column="X", split=None)

    @property
    def y(self) -> Optional[np.ndarray]:
        """
        Return the target label columns

        Reads the label entry of the top-level container (no split applied);
        None if that entry is missing.
        """
        return self.get(column="y", split=None)

    def get(self, column: str, split: Optional[str]) -> Optional[np.ndarray]:
        """
        Explicitly split validation splits
        Assumes self.dataframe has a get method to return a dictionary of {'X': X, 'y': y}
        Uses self.label_columns if y is named something else -- only looks at first entry in list

        returns None for any combination of column/split that isn't present

        :param column: which array to return; must be "X" or "y"
        :param split: name of the split to read from, or None to read the
            top-level container directly
        :return: the stored array, or None when the split or column is absent
        :raises ValueError: if column is neither "X" nor "y"
        """
        if column not in ("X", "y"):
            raise ValueError("Only support columns: X & y")

        if split is None:
            # Assumes there is no top level split
            split_dict = self.dataframe
        else:
            split_dict = self.dataframe.get(split)

        if split_dict is None:
            split_dict = {}  # Make compatible with return syntax

        if column == "X":
            return split_dict.get("X", None)

        # Fall back to the default "y" key when no label column is configured,
        # so a missing label yields None instead of an IndexError.
        label_columns = self.label_columns
        label_key = label_columns[0] if label_columns else "y"
        return split_dict.get(label_key, None)

    def get_split_names(self) -> List[str]:
        """
        Helper to expose the splits contained in the dataset

        Returns the keys of self.dataframe as a list.
        NOTE(review): for a flat container (no top-level split) these keys are
        the column names rather than split names -- confirm callers expect that.
        """
        # assumes dict like container
        return list(self.dataframe.keys())

    def get_feature_names(self) -> List[str]:
        """
        Should return a list of the features in the dataset

        Always the single entry "X".
        """
        return ["X"]