Source code for simpleml.datasets.dataset_splits

"""
Module for split transfer objects. Internal object to shuttle dataset slices
between persistables
"""

[docs]__author__ = "Elisha Yadgaran"
from collections import defaultdict from typing import Any import pandas as pd from simpleml.imports import ddDataFrame, ddSeries
class Split(dict):
    """
    Container class for splits

    A plain ``dict`` whose keys can also be read as attributes. Keys that are
    not present resolve to ``None`` on attribute access instead of raising, so
    callers can probe for optional entries (``split.some_key``) while still
    unpacking the object with ``**split``.
    """

    def __getattr__(self, attr):
        """
        Default attribute processor
        (Used in combination with __getitem__ to enable ** syntax)

        Python only calls this hook after regular attribute lookup has failed,
        so real ``dict`` methods and attributes are never shadowed.

        :param attr: name of the attribute being looked up
        :return: the value stored under key ``attr`` or None if there is none
        :raises AttributeError: for dunder names that are not stored keys
        """
        # Protocol probes (``__deepcopy__``, ``__getstate__``, ``__array__``...)
        # must fail with AttributeError. Answering them with None makes
        # ``hasattr`` report True and lets callers try to invoke None.
        is_dunder = attr.startswith("__") and attr.endswith("__")
        if is_dunder and attr not in self:
            raise AttributeError(
                "{!r} object has no attribute {!r}".format(type(self).__name__, attr)
            )
        return self.get(attr)

    @staticmethod
    def is_null_type(obj: Any) -> bool:
        """
        Helper to check for nulls - useful to not pass "empty" attributes
        so defaults of None will get returned downstream instead
        ex: **split -> all non null named params

        :param obj: value to inspect
        :return: True for None, empty pandas/dask frames or series and empty
            lists, tuples or dicts; False for everything else
        """
        # NoneType
        if obj is None:
            return True

        # Empty built-ins - rely on container truthiness
        if isinstance(obj, (list, tuple, dict)):
            return not obj

        # Pandas objects
        if isinstance(obj, (pd.DataFrame, pd.Series)):
            return bool(obj.empty)

        # Dask objects
        # NOTE(review): len() of a dask index presumably forces a computation
        # - confirm this is acceptable for large collections
        if isinstance(obj, (ddDataFrame, ddSeries)):
            return len(obj.index) == 0

        # Anything else counts as a real value
        return False

    def squeeze(self):
        """
        Helper method to clear up any null-type keys

        Mutates the split in place.

        :return: self, to allow easy chaining
        """
        # Collect the keys first: a dict cannot change size while iterated
        null_keys = [key for key, value in self.items() if self.is_null_type(value)]
        for key in null_keys:
            self.pop(key)

        # Return self for easy chaining
        return self
class SplitContainer(defaultdict):
    """
    Explicit instantiation of a defaultdict returning split objects
    """

    def __init__(self, default_factory=Split, **kwargs):
        """
        Build the container.

        :param default_factory: callable used to create the value for a key
            that is missing on lookup; defaults to Split
        :param kwargs: initial key/value pairs, forwarded to defaultdict
        """
        super().__init__(default_factory, **kwargs)