# Source code for simpleml.datasets.pandas.pipeline

"""
Pipeline derived datasets
"""

# Module author metadata (the stray "[docs]" link marker from the HTML export is removed).
__author__ = "Elisha Yadgaran"
from simpleml.utils.errors import DatasetError

from .base import BasePandasDataset
class PandasPipelineDataset(BasePandasDataset):
    """
    Pandas dataset class that generates the dataframe as the output of the
    linked pipeline
    """

    def build_dataframe(self) -> None:
        """
        Transform raw dataset via dataset pipeline for production ready dataset

        For every split name reported by the pipeline, the pipeline is asked
        to transform that split (with ``X=None``), and each result is passed
        through ``merge_split``. A single merged split is stored directly on
        ``self.dataframe``; several are combined with
        ``concatenate_dataframes`` first.

        :raises DatasetError: if no pipeline is set, or if the pipeline
            reports no splits (there would be nothing to build from)
        """
        if self.pipeline is None:
            raise DatasetError("Must set pipeline before building dataframe")

        split_names = self.pipeline.get_split_names()

        # Transform every split before merging any of them, so the pipeline
        # calls all happen ahead of the merge calls
        splits = [
            self.pipeline.transform(X=None, split=split_name)
            for split_name in split_names
        ]
        merged_splits = [self.merge_split(split) for split in splits]

        # Without this guard an empty split list surfaces as an opaque
        # IndexError on merged_splits[0] below
        if not merged_splits:
            raise DatasetError(
                "Pipeline returned no splits; cannot build dataframe"
            )

        if len(merged_splits) > 1:
            # Combine multiple splits into one dataframe. The join itself is
            # delegated to concatenate_dataframes (original note: join row
            # wise - drop index in case duplicates exist)
            self.dataframe = self.concatenate_dataframes(merged_splits, split_names)
        else:
            self.dataframe = merged_splits[0]