Source code for simpleml.pipelines.validation_split_mixins

'''
Module for different split methods for cross validation

    1) No Split -- Just use all the data
    2) Explicit Split -- dataset class defines the split
    3) Percentage -- random split support for train, validation, test
    4) Chronological -- time based split support for train, validation, test
    5) KFold
'''

[docs]__author__ = 'Elisha Yadgaran'
from simpleml.constants import TRAIN_SPLIT, VALIDATION_SPLIT, TEST_SPLIT from abc import ABCMeta, abstractmethod from sklearn.model_selection import train_test_split from future.utils import with_metaclass from collections import defaultdict import pandas as pd
class Split(dict):
    '''
    Container class for splits

    A plain dict whose keys can also be read as attributes. Reading a key
    that is not present returns None instead of raising, so callers can
    write `split.X` / `split.y` without checking membership first.
    '''

    def __getattr__(self, attr):
        '''
        Default attribute processor
        (Used in combination with __getitem__ to enable ** syntax)

        Python only calls this after normal attribute lookup has failed, so
        real dict methods are never shadowed.

        :param attr: name of the attribute (dict key) being read
        :return: the value stored under `attr`, or None if there is none
        :raises AttributeError: for double-underscore names that are not
            stored as keys
        '''
        try:
            return self[attr]
        except KeyError:
            # Protocol probes such as hasattr(obj, '__array__') must see a
            # missing attribute; answering None makes the probe succeed and
            # the prober then tries to call None.
            if attr.startswith('__') and attr.endswith('__'):
                raise AttributeError(attr)
            return None

    @staticmethod
    def is_null_type(obj):
        '''
        Helper to check for nulls - useful to not pass "empty" attributes
        so defaults of None will get returned downstream instead
        ex: **split -> all non null named params

        :param obj: any value
        :return: True for None, an empty pandas DataFrame/Series, or an
            empty list/tuple/dict (including subclasses); False otherwise
        '''
        # NoneType
        if obj is None:
            return True

        # Pandas objects
        if isinstance(obj, (pd.DataFrame, pd.Series)) and obj.empty:
            return True

        # Empty built-ins - relies on container truthiness
        if isinstance(obj, (list, tuple, dict)) and not obj:
            return True

        # Anything else counts as a real value
        return False

    def squeeze(self):
        '''
        Helper method to clear up any null-type keys

        Mutates the split in place.

        :return: self, for easy chaining
        '''
        # Collect first: keys cannot be removed while iterating the dict
        null_keys = [key for key, value in self.items() if self.is_null_type(value)]
        for key in null_keys:
            del self[key]

        return self
class SplitContainer(defaultdict):
    '''
    Explicit instantiation of a defaultdict returning split objects

    Looking up a split name that was never stored creates (and stores) a
    fresh value from `default_factory`, which is `Split` unless overridden.
    '''

    def __init__(self, default_factory=Split, **kwargs):
        '''
        :param default_factory: zero-argument callable used to build the
            value for a missing key
        :param kwargs: initial split name -> value entries
        '''
        super(SplitContainer, self).__init__(
            default_factory,
            **kwargs
        )
class SplitMixin(with_metaclass(ABCMeta, object)):
    '''
    Abstract base for the split mixins

    Subclasses implement `split_dataset`, which is responsible for
    populating `self._dataset_splits`.
    '''

    @abstractmethod
    def split_dataset(self):
        '''
        Set the split criteria

        Must set self._dataset_splits
        '''

    def containerize_split(self, split_dict):
        '''
        Wrap a dict of splits in a SplitContainer

        :param split_dict: mapping expanded as keyword arguments into
            SplitContainer (so a 'default_factory' entry overrides the
            container's default factory)
        :return: the new SplitContainer
        '''
        container = SplitContainer(**split_dict)
        return container

    def get_split_names(self):
        '''
        List the names of the available splits

        Triggers `split_dataset` first if `_dataset_splits` is unset or None.

        :return: list of the keys currently in self._dataset_splits
        '''
        # Lazily compute the splits the first time they are needed
        if getattr(self, '_dataset_splits', None) is None:
            self.split_dataset()

        return list(self._dataset_splits.keys())
class NoSplitMixin(SplitMixin):
    '''
    Mixin that performs no splitting at all
    '''

    def split_dataset(self):
        '''
        Non-split mixin class. Returns full dataset for any split name

        Stores a container with no explicit keys whose default factory
        always hands back the same Split built from self.dataset.X and
        self.dataset.y (null-type entries removed).
        '''
        full_split = Split(X=self.dataset.X, y=self.dataset.y)
        full_split.squeeze()

        def return_full_split():
            # Every split name resolves to the one shared Split object
            return full_split

        self._dataset_splits = self.containerize_split(
            {'default_factory': return_full_split}
        )
class ExplicitSplitMixin(SplitMixin):
    '''
    Mixin for datasets that define their own train/validation/test split
    '''

    def split_dataset(self):
        '''
        Method to split the dataframe into different sets. Assumes dataset
        explicitly delineates between train, validation, and test

        For each split name, asks the dataset for its 'X' and 'y' via
        self.dataset.get(column, split_name) and stores the result as a
        Split with null-type entries removed.
        '''
        explicit_splits = {}
        for split_name in (TRAIN_SPLIT, VALIDATION_SPLIT, TEST_SPLIT):
            features = self.dataset.get('X', split_name)
            labels = self.dataset.get('y', split_name)
            explicit_splits[split_name] = Split(X=features, y=labels).squeeze()

        self._dataset_splits = self.containerize_split(explicit_splits)
class RandomSplitMixin(SplitMixin):
    '''
    Class to randomly split dataset into different sets

    NOTE(review): relies on `self.config` (dict-like) and `self.dataset`
    (exposing `.X` and `.y`) being provided by the class this mixin is
    combined with - confirm against the pipeline base class.
    '''

    # Sizes closer to zero than this are treated as "no split requested".
    # Needed because sizes derived by subtraction carry float residue
    # (e.g. 1.0 - 0.7 - 0.3 is ~5.6e-17, 1.0 - 0.9 - 0.1 is ~-2.8e-17).
    _ZERO_SIZE_TOLERANCE = 1e-9

    def __init__(self, train_size, test_size=None, validation_size=0.0,
                 random_state=123, shuffle=True, **kwargs):
        '''
        Set splitting params:
        By default validation is 0.0 because it is only used for hyperparameter
        tuning

        :param train_size: size of the train split; if None it becomes
            1.0 - validation_size
        :param test_size: size of the test split; if None it becomes
            1.0 - train_size - validation_size
        :param validation_size: size of the validation split
        :param random_state: seed passed to train_test_split
        :param shuffle: shuffle flag passed to train_test_split
        :param kwargs: forwarded to the next __init__ in the MRO
        '''
        super(RandomSplitMixin, self).__init__(**kwargs)

        if train_size is None:
            train_size = 1.0 - validation_size
        if test_size is None:
            test_size = 1.0 - train_size - validation_size

        # Pipeline Params
        self.config.update({
            'train_size': train_size,
            'validation_size': validation_size,
            'test_size': test_size,
            'random_state': random_state,
            'shuffle': shuffle
        })

    def split_dataset(self):
        '''
        Overwrite method to split by percentage

        Reads the sizes, seed and shuffle flag from self.config, carves off
        the test split first, then divides the remainder into train and
        validation. Stores the result in self._dataset_splits.
        '''
        train_size = self.config.get('train_size')
        validation_size = self.config.get('validation_size')
        test_size = self.config.get('test_size')
        random_state = self.config.get('random_state')
        shuffle = self.config.get('shuffle')

        # Sklearn's train test split can only accommodate one split per iteration
        if abs(test_size) < self._ZERO_SIZE_TOLERANCE:
            # No split necessary. A tolerance is used instead of `== 0` so
            # that float residue in a derived test_size neither produces a
            # spurious one-sample test set nor a negative size that sklearn
            # rejects.
            X_remaining, y_remaining = self.dataset.X, self.dataset.y
            X_test, y_test = [], []
        else:
            X_remaining, X_test, y_remaining, y_test = train_test_split(
                self.dataset.X, self.dataset.y, test_size=test_size,
                random_state=random_state, shuffle=shuffle)

        # Validation size is configured relative to the full dataset;
        # rescale it to be relative to what is left after removing test
        calibrated_validation_size = float(validation_size) / (validation_size + train_size)
        if abs(calibrated_validation_size) < self._ZERO_SIZE_TOLERANCE:
            # No split necessary
            X_train, y_train = X_remaining, y_remaining
            X_val, y_val = [], []
        else:
            X_train, X_val, y_train, y_val = train_test_split(
                X_remaining, y_remaining, test_size=calibrated_validation_size,
                random_state=random_state, shuffle=shuffle)

        self._dataset_splits = self.containerize_split({
            TRAIN_SPLIT: Split(X=X_train, y=y_train).squeeze(),
            VALIDATION_SPLIT: Split(X=X_val, y=y_val).squeeze(),
            TEST_SPLIT: Split(X=X_test, y=y_test).squeeze()
        })
class ChronologicalSplitMixin(SplitMixin):
    '''
    Mixin reserved for time based splitting

    Defines no split_dataset of its own; the constructor only forwards its
    keyword arguments up the MRO.
    '''

    def __init__(self, **kwargs):
        '''
        :param kwargs: forwarded unchanged to the next __init__ in the MRO
        '''
        super(ChronologicalSplitMixin, self).__init__(
            **kwargs
        )
class KFoldSplitMixin(SplitMixin):
    '''
    TBD on how to implement this. KFold requires K models and unique
    datasets, so it may be easier to wrap a parallelized implementation
    that internally creates K new Pipeline and Model objects

    Currently an empty placeholder: it adds nothing to SplitMixin.
    '''