simpleml.save_patterns.serializers.dask

Module for Dask save patterns

Module Contents

Classes

DaskCSVSerializer

DaskHDFSerializer

DaskJSONSerializer

DaskORCSerializer

DaskParquetSerializer

DaskPersistenceMethods

Base class for internal dask serialization/deserialization options

Attributes

__author__

simpleml.save_patterns.serializers.dask.__author__ = Elisha Yadgaran[source]
class simpleml.save_patterns.serializers.dask.DaskCSVSerializer[source]

Bases: simpleml.save_patterns.base.BaseSerializer

static deserialize(filepaths, source_directory='system_temp', **kwargs)[source]
Parameters
  • filepaths (List[str]) –

  • source_directory (str) –

Return type

Dict[str, Any]

static serialize(obj, filepath, format_directory=CSV_DIRECTORY, format_extension='.csv', destination_directory='system_temp', **kwargs)[source]
Parameters
  • obj (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • format_directory (str) –

  • format_extension (str) –

  • destination_directory (str) –

Return type

Dict[str, str]

class simpleml.save_patterns.serializers.dask.DaskHDFSerializer[source]

Bases: simpleml.save_patterns.base.BaseSerializer

static deserialize(filepath, source_directory='system_temp', **kwargs)[source]
Parameters
  • filepath (str) –

  • source_directory (str) –

Return type

Dict[str, Any]

static serialize(obj, filepath, format_directory=HDF5_DIRECTORY, format_extension='.hdf', destination_directory='system_temp', **kwargs)[source]
Parameters
  • obj (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • format_directory (str) –

  • format_extension (str) –

  • destination_directory (str) –

Return type

Dict[str, str]

class simpleml.save_patterns.serializers.dask.DaskJSONSerializer[source]

Bases: simpleml.save_patterns.base.BaseSerializer

static deserialize(filepaths, source_directory='system_temp', **kwargs)[source]
Parameters
  • filepaths (List[str]) –

  • source_directory (str) –

Return type

Dict[str, Any]

static serialize(obj, filepath, format_directory=JSON_DIRECTORY, format_extension='.jsonl', destination_directory='system_temp', **kwargs)[source]
Parameters
  • obj (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • format_directory (str) –

  • format_extension (str) –

  • destination_directory (str) –

Return type

Dict[str, str]

class simpleml.save_patterns.serializers.dask.DaskORCSerializer[source]

Bases: simpleml.save_patterns.base.BaseSerializer

static deserialize(filepath, source_directory='system_temp', **kwargs)[source]
Parameters
  • filepath (str) –

  • source_directory (str) –

Return type

Dict[str, Any]

static serialize(obj, filepath, format_directory=ORC_DIRECTORY, format_extension='.orc', destination_directory='system_temp', **kwargs)[source]
Parameters
  • obj (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • format_directory (str) –

  • format_extension (str) –

  • destination_directory (str) –

Return type

Dict[str, str]

class simpleml.save_patterns.serializers.dask.DaskParquetSerializer[source]

Bases: simpleml.save_patterns.base.BaseSerializer

static deserialize(filepath, source_directory='system_temp', **kwargs)[source]
Parameters
  • filepath (str) –

  • source_directory (str) –

Return type

Dict[str, Any]

static serialize(obj, filepath, format_directory=PARQUET_DIRECTORY, format_extension='.parquet', destination_directory='system_temp', **kwargs)[source]
Parameters
  • obj (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • format_directory (str) –

  • format_extension (str) –

  • destination_directory (str) –

Return type

Dict[str, str]

class simpleml.save_patterns.serializers.dask.DaskPersistenceMethods[source]

Bases: object

Base class for internal dask serialization/deserialization options

Wraps dd.DataFrame methods with sensible defaults. Uses dask bag alternatives for optimizations (notably for read parallelization and memory handling).

INDEX_COLUMN = simpleml_index[source]
classmethod read_csv(cls, filepaths, sample_rows=1000, **kwargs)[source]
Parameters
  • filepaths (List[str]) –

  • sample_rows (int) –

Return type

simpleml.imports.ddDataFrame

static read_fwf(**kwargs)[source]
Return type

simpleml.imports.ddDataFrame

static read_hdf(filepath, **kwargs)[source]
Parameters

filepath (str) –

Return type

simpleml.imports.ddDataFrame

classmethod read_json(cls, filepaths, persist=False, **kwargs)[source]

Uses the dask bag implementation to optimize the read. The persist parameter (bool) is a flag to return a processing future instead of lazily computing later.

Parameters

filepaths (List[str]) –

Return type

simpleml.imports.ddDataFrame

static read_orc(filepath, **kwargs)[source]
Parameters

filepath (str) –

Return type

simpleml.imports.ddDataFrame

static read_parquet(filepath, **kwargs)[source]
Parameters

filepath (str) –

Return type

simpleml.imports.ddDataFrame

static read_sql_table(**kwargs)[source]
Return type

simpleml.imports.ddDataFrame

static read_table(**kwargs)[source]
Return type

simpleml.imports.ddDataFrame

static read_text(*args, **kwargs)[source]

Dask Bag wrapper to read text and return a bag

Return type

simpleml.imports.dbBag

classmethod to_csv(cls, df, filepath, overwrite=True, **kwargs)[source]
Parameters
  • df (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • overwrite (bool) –

Return type

None

static to_hdf(df, filepath, overwrite=True, **kwargs)[source]
Parameters
  • df (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • overwrite (bool) –

Return type

None

classmethod to_json(cls, df, filepath, overwrite=True, **kwargs)[source]
Parameters
  • df (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • overwrite (bool) –

Return type

None

static to_orc(df, filepath, overwrite=True, **kwargs)[source]
Parameters
  • df (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • overwrite (bool) –

Return type

None

static to_parquet(df, filepath, overwrite=True, **kwargs)[source]
Parameters
  • df (simpleml.imports.ddDataFrame) –

  • filepath (str) –

  • overwrite (bool) –

Return type

None

static to_sql(df, **kwargs)[source]
Parameters

df (simpleml.imports.ddDataFrame) –

Return type

None