Source code for simpleml.datasets.pandas_mixin

'''
Pandas Module for external dataframes

Inherit and extend for particular patterns. It is a bit of a misnomer to use the
term "dataframe", since there are very few expected attributes and they are by no
means unique to pandas.
'''

# Module author metadata
__author__ = 'Elisha Yadgaran'
import pandas as pd

from simpleml.datasets.abstract_mixin import AbstractDatasetMixin
# Name of the column that tags each row with the dataset split it belongs to.
# `PandasDatasetMixin.concatenate_dataframes` writes it and
# `PandasDatasetMixin.get` filters on it and strips it from returned data.
DATAFRAME_SPLIT_COLUMN = 'DATASET_SPLIT'
class PandasDatasetMixin(AbstractDatasetMixin):
    '''
    "Pandas"esque mixin class with control mechanism for `self.dataframe` of
    type `dataframe`. Only assumes pandas syntax, not types, so should be
    compatible with pandas drop-in replacements.

    In particular, `self.dataframe` is used as follows:
        A - type of pd.DataFrame:
            - query()
            - columns
            - drop()
            - __getitem__()
            - squeeze()
        B - any other type:
            - get()  (called with the split; the returned object is then
              accessed through `columns`, __getitem__() and squeeze())
            - __getitem__()
            - squeeze()

    The mixin also reads `self.label_columns` to tell target columns apart
    from feature columns.
    '''

    @property
    def X(self):
        '''
        Return the subset that isn't in the target labels (across all
        potential splits)
        '''
        return self.get(column='X', split=None)

    @property
    def y(self):
        '''
        Return the target label columns (across all potential splits)
        '''
        return self.get(column='y', split=None)

    def get(self, column, split):
        '''
        Return the X or y portion of the data for the requested split.

        When `self.dataframe` is a pd.DataFrame, rows are selected by
        comparing `DATAFRAME_SPLIT_COLUMN` against `split` (all rows when
        `split` is None) and the split column is left out of the result.
        Otherwise `self.dataframe.get(split)` is used to look up the data
        for the split, with a missing (None) result replaced by an empty
        dataframe.

        Uses `self.label_columns` to separate x and y columns inside the
        selected data. Missing combinations of column & split yield an
        empty result rather than an error.

        :param column: which portion to return, either 'X' or 'y'
        :param split: split to select, or None for the full dataset
        :return: for 'y', the label columns present in the data, squeezed to
            reduce dimensionality; for 'X', every remaining column
        :raises ValueError: if `column` is neither 'X' nor 'y'
        '''
        if column not in ('X', 'y'):
            raise ValueError('Only support columns: X & y')

        if isinstance(self.dataframe, pd.DataFrame):
            if split is None:  # Return the full dataset (all splits)
                df = self.dataframe
            else:
                df = self.dataframe.query("{}=='{}'".format(DATAFRAME_SPLIT_COLUMN, split))

            if DATAFRAME_SPLIT_COLUMN in df.columns:
                # Drop into a new object instead of in place. With
                # `split=None`, `df` IS `self.dataframe`, so an in-place drop
                # would permanently remove the split column from the stored
                # dataset and break every later split-specific lookup.
                df = df.drop(DATAFRAME_SPLIT_COLUMN, axis=1)
        else:
            df = self.dataframe.get(split)

            if df is None:  # Make compatible with subscription syntax
                df = pd.DataFrame()

        if column == 'y':
            # Only request label columns that exist so a missing label does
            # not raise; squeeze to reduce dimensionality of return
            present_labels = [col for col in self.label_columns if col in df.columns]
            return df[present_labels].squeeze()

        # Everything that is not a label column is treated as a feature
        return df[df.columns.difference(self.label_columns)]

    def concatenate_dataframes(self, dataframes, split_names):
        '''
        Helper method to merge dataframes into a single one with the split
        specified under `DATAFRAME_SPLIT_COLUMN`

        Note: the split column is assigned directly on each input dataframe,
        so the objects passed in are modified in place.

        :param dataframes: dataframes to merge, paired positionally with
            `split_names`
        :param split_names: split name to record for each dataframe
        :return: single dataframe with the inputs stacked row wise and a
            fresh index
        '''
        for df, name in zip(dataframes, split_names):
            df[DATAFRAME_SPLIT_COLUMN] = name

        # Join row wise - drop index in case duplicates exist
        return pd.concat(dataframes, axis=0, ignore_index=True)

    def get_feature_names(self):
        '''
        Should return a list of the features in the dataset

        :return: column names of `self.X` as a list
        '''
        return self.X.columns.tolist()

    @staticmethod
    def load_csv(filename, **kwargs):
        '''
        Helper method to read in a csv file

        :param filename: passed straight to `pd.read_csv`
        :param kwargs: extra keyword arguments forwarded to `pd.read_csv`
        :return: result of `pd.read_csv`
        '''
        return pd.read_csv(filename, **kwargs)