Source code for simpleml.persistables.base_persistable

from sqlalchemy import MetaData, Column, func, String, Boolean, Integer
from sqlalchemy.dialects.postgresql import JSONB
from simpleml.persistables.meta_registry import MetaRegistry, SIMPLEML_REGISTRY
from simpleml.persistables.guid import GUID
from simpleml.persistables.base_sqlalchemy import BaseSQLAlchemy
from simpleml.persistables.saving import AllSaveMixin
from simpleml.persistables.hashing import CustomHasherMixin
from simpleml.utils.library_versions import INSTALLED_LIBRARIES
import uuid
from abc import abstractmethod
from future.utils import with_metaclass


# Module-level author metadata.
__author__ = 'Elisha Yadgaran'


class Persistable(with_metaclass(MetaRegistry, BaseSQLAlchemy, AllSaveMixin, CustomHasherMixin)):
    '''
    Base class for all SimpleML database objects. Defaults to PostgreSQL
    but can be swapped out for any supported SQLAlchemy backend.

    Takes advantage of sqlalchemy-mixins to enable active record operations
    (TableModel.save(), create(), where(), destroy())

    -------
    Schema
    -------
    id: Random UUID(4). Used over auto incrementing id to minimize collision probability
        with distributed trainings and authors (especially if using central server
        to combine results across different instantiations of SimpleML)

    hash_: (stored in column "hash") Use hash of object to uniquely identify
        the contents at train time
    registered_name: class name of object defined when importing
        Can be used for the drag and drop GUI - also for prescribing training config
    author: creator
    project: Project objects are associated with. Useful if multiple persistables
        relate to the same project and want to be grouped (but have different names)
        also good for implementing row based security across teams
    name: friendly name - primary way of tracking evolution of "same" object over time
    version: autoincrementing id of "friendly name"
    version_description: description that explains what is new or different about this version

    # Persistence of fitted states
    has_external_files = boolean field to signify presence of saved files not in (main) db
    filepaths = JSON object with external file details
        Structure:
        {
            "disk": [
                path to file, relative to base simpleml folder (default ~/.simpleml),
                ...
            ],
            "database": [
                (schema, table_name), (for files extractable with `select * from`)
                ....
            ],
            "pickled": [
                guid, (for files in binary blobs)
                ...
            ]
        }

    metadata_: (stored in column "metadata") Generic JSON store for random
        attributes. The class attribute `metadata` itself is the SQLAlchemy
        MetaData object, hence the trailing underscore on the column attribute.
    '''
    __abstract__ = True

    # Uses main (public) schema
    metadata = MetaData()

    # Use random uuid for graceful distributed instantiation
    # also allows saved objects to include id in filename (before db persistence)
    id = Column(GUID, primary_key=True, default=uuid.uuid4)

    # Specific metadata for versioning and comparison
    # Use hash for code/data content for referencing similar objects
    # Use registered name for internal object pointer - internal code can
    # still get updated between trainings (hence hash)
    # TODO: figure out how to hash objects in a way that signifies code content
    hash_ = Column('hash', String, nullable=False)
    registered_name = Column(String, nullable=False)
    author = Column(String, default='default', nullable=False)
    project = Column(String, default='default', nullable=False)
    name = Column(String, default='default', nullable=False)
    version = Column(Integer, nullable=False)
    version_description = Column(String, default='')

    # Persistence of fitted states
    has_external_files = Column(Boolean, default=False)
    # Callable default: every row gets its own fresh dict instead of all rows
    # sharing one mutable dict object created at class-definition time
    filepaths = Column(JSONB, default=dict)

    # Generic store and metadata for all child objects
    # (callable default for the same reason as `filepaths`)
    metadata_ = Column('metadata', JSONB, default=dict)

    def __init__(self, name=None, has_external_files=False,
                 author=None, project=None, version_description=None,
                 save_method='disk_pickled', **kwargs):
        '''
        Set the attributes expected to exist at time of instantiation

        :param name: friendly name of the object
        :param has_external_files: whether saved files exist outside the (main) db
        :param author: creator
        :param project: project the object is associated with
        :param version_description: what is new or different about this version
        :param save_method: operational setting recorded under state['save_method']
        :param kwargs: accepted but not used by this initializer
        '''
        # Initialize values expected to exist at time of instantiation
        self.registered_name = self.__class__.__name__
        self.id = uuid.uuid4()
        self.author = author
        self.project = project
        self.name = name
        self.has_external_files = has_external_files
        self.version_description = version_description

        # Special place for SimpleML internal params
        # Think of as the config to initialize objects
        self.metadata_ = {
            # Place for parameters that uniquely configure an instance on initialization
            'config': {},
            # Place for transitory values that may be set post initialization
            # (and want to be persisted)
            'state': {},
        }

        # For external loading - initialize to None
        self.unloaded_externals = None

        # Store save method in state metadata as an operational setting, otherwise
        # it could affect the hash and result in a different object per save location
        self.state['save_method'] = save_method

    @property
    def config(self):
        '''
        Shortcut to the 'config' section of the metadata store
        '''
        return self.metadata_['config']

    @property
    def state(self):
        '''
        Shortcut to the 'state' section of the metadata store
        '''
        return self.metadata_['state']

    @abstractmethod
    def _hash(self):
        '''
        Each subclass should implement a hashing routine to uniquely AND
        consistently identify the object contents. Consistency is important to
        ensure ability to assert identity across code definitions
        '''

    def _get_latest_version(self):
        '''
        Versions should be autoincrementing for each object (constrained over
        friendly name). Executes a database lookup and increments..

        :return: one more than the highest stored version for this name,
            or 1 when the lookup finds none
        '''
        model = self.__class__
        # NOTE(review): `query_by` is not defined in this class - presumably
        # inherited from BaseSQLAlchemy; confirm
        last_version = model.query_by(
            func.max(model.version)
        ).filter(
            model.name == self.name
        ).scalar()

        # The lookup yields None when no version exists for this name yet
        return (last_version or 0) + 1

    def save(self):
        '''
        Each subclass needs to instantiate a save routine to persist to the
        database and any other required filestore

        sqlalchemy_mixins supports active record style TableModel.save()
        so can still call super(Persistable, self).save()

        Note that the version is looked up and reassigned on every call.
        '''
        if self.has_external_files:
            # NOTE(review): not defined in this class - presumably provided
            # by AllSaveMixin; confirm
            self._save_external_files()

        # Hash contents upon save
        self.hash_ = self._hash()

        # Get the latest version for this "friendly name"
        self.version = self._get_latest_version()

        # Store library versions in case of future loads into unsupported environments
        self.metadata_['library_versions'] = INSTALLED_LIBRARIES

        super(Persistable, self).save()

    def load(self, load_externals=True):
        '''
        Counter operation for save
        Needs to load any file and db objects

        Class definition is stored by registered_name param and
        Pickled objects are stored in external_filename param

        :param load_externals: Boolean flag whether to load the external files
            useful for relationships that only need class definitions and not data
        :raises TypeError: if no class is registered under registered_name
        '''
        # Lookup appropriate class and reinstantiate
        loaded_class = self._load_class()
        if loaded_class is None:
            # Assigning None to __class__ would raise an opaque TypeError;
            # raise the same exception type with an actionable message instead
            raise TypeError(
                "No class registered under name '{}'; make sure the module "
                "defining it has been imported before loading".format(
                    self.registered_name))
        self.__class__ = loaded_class

        if self.has_external_files and load_externals:
            # NOTE(review): not defined in this class - presumably provided
            # by AllSaveMixin; confirm
            self._load_external_files()

        # Flag objects whose external files exist but were deliberately skipped
        self.unloaded_externals = bool(
            self.has_external_files and not load_externals)

    def _load_class(self):
        '''
        Wrapper function to call global registry of all imported class names

        :return: whatever the registry holds under this object's registered_name
        '''
        return SIMPLEML_REGISTRY.get(self.registered_name)