Source code for simpleml.persistables.hashing

'''
Mixin classes to handle hashing
'''

__author__ = 'Elisha Yadgaran'


import pandas as pd
import numpy as np
from pandas.util import hash_pandas_object
import inspect


class CustomHasherMixin(object):
    '''
    Mixin class to hash any object
    '''

    def custom_hasher(self, object_to_hash, custom_class_proxy=type(object.__dict__)):
        """
        Adapted from: https://stackoverflow.com/questions/5884066/hashing-a-dictionary

        Makes a hash from a dictionary, list, tuple or set to any level, that
        contains only other hashable types (including any lists, tuples, sets,
        and dictionaries). In the case where other kinds of objects (like
        classes) need to be hashed, pass in a collection of object attributes
        that are pertinent. For example, a class can be hashed in this fashion:

            custom_hasher([cls.__dict__, cls.__name__])

        A function can be hashed like so:

            custom_hasher([fn.__dict__, fn.__code__])

        python 3.3+ changes the default hash method to add an additional random
        seed. Need to set the global PYTHONHASHSEED=0 or use a different hash
        function
        """
        if type(object_to_hash) == custom_class_proxy:
            # Strip dunder attributes from class dict proxies before hashing
            o2 = {}
            for k, v in object_to_hash.items():
                if not k.startswith("__"):
                    o2[k] = v
            object_to_hash = o2

        if isinstance(object_to_hash, (set, tuple, list)):
            return hash(tuple([self.custom_hasher(e) for e in object_to_hash]))

        elif isinstance(object_to_hash, np.ndarray):
            # tobytes() replaces the tostring() alias removed in numpy 2.0
            return self.custom_hasher(object_to_hash.tobytes())

        elif isinstance(object_to_hash, pd.DataFrame):
            # Pandas is unable to hash numpy arrays so prehash those
            # (applymap was renamed DataFrame.map in pandas 2.1)
            return hash_pandas_object(object_to_hash.applymap(
                lambda element: self.custom_hasher(element)
                if isinstance(element, np.ndarray) else element),
                index=False).sum()

        elif isinstance(object_to_hash, pd.Series):
            # Pandas is unable to hash numpy arrays so prehash those
            return hash_pandas_object(object_to_hash.apply(
                lambda element: self.custom_hasher(element)
                if isinstance(element, np.ndarray) else element),
                index=False).sum()

        elif object_to_hash is None:
            # hash of None is unstable between systems
            return -12345678987654321

        elif isinstance(object_to_hash, dict):
            return hash(tuple(
                sorted([self.custom_hasher(item) for item in object_to_hash.items()])
            ))

        elif isinstance(object_to_hash, type(lambda: 0)):
            # Functions don't hash consistently (deciding behavioral equality
            # reduces to the halting problem), so hash the source code string
            # https://stackoverflow.com/questions/33998594/hash-for-lambda-function-in-python
            return self.custom_hasher(inspect.getsource(object_to_hash))

        elif isinstance(object_to_hash, type):
            # Have to keep this at the end of the try list; np.ndarray,
            # pd.DataFrame/Series, and function are also of <type 'type'>
            return self.custom_hasher(repr(object_to_hash))
            # return self.custom_hasher(inspect.getsource(object_to_hash))

        else:
            return hash(object_to_hash)
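# Usage sketch (illustrative, not part of the original module). Two equal
# containers hash identically, whichever nested branches they exercise; per
# the docstring above, stability across interpreter sessions additionally
# requires PYTHONHASHSEED=0 wherever the builtin hash() is involved:
#
#     >>> class Demo(CustomHasherMixin):
#     ...     pass
#     >>> demo = Demo()
#     >>> nested = {'params': [1, 2, 3], 'weights': np.array([0.1, 0.2])}
#     >>> demo.custom_hasher(nested) == demo.custom_hasher(dict(nested))
#     True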
""" Copied from joblib library -- no modification, just avoiding unnecessary dependencies https://github.com/joblib/joblib/blob/master/joblib/hashing.py#L246 Fast cryptographic hash of Python objects, with a special case for fast hashing of numpy arrays. """ # Author: Gael Varoquaux <gael dot varoquaux at normalesup dot org> # Copyright (c) 2009 Gael Varoquaux # License: BSD Style, 3 clauses. import pickle import hashlib import sys import types import struct import io import decimal PY3_OR_LATER = sys.version_info[0] >= 3 try: _basestring = basestring _bytes_or_unicode = (str, unicode) except NameError: _basestring = str _bytes_or_unicode = (bytes, str) if PY3_OR_LATER: Pickler = pickle._Pickler else: Pickler = pickle.Pickler class _ConsistentSet(object): """ Class used to ensure the hash of Sets is preserved whatever the order of its items. """ def __init__(self, set_sequence): # Forces order of elements in set to ensure consistent hash. try: # Trying first to order the set assuming the type of elements is # consistent and orderable. # This fails on python 3 when elements are unorderable # but we keep it in a try as it's faster. self._sequence = sorted(set_sequence) except (TypeError, decimal.InvalidOperation): # If elements are unorderable, sorting them using their hash. # This is slower but works in any case. self._sequence = sorted((hash(e) for e in set_sequence)) class _MyHash(object): """ Class used to hash objects that won't normally pickle """ def __init__(self, *args): self.args = args
class Hasher(Pickler):
    """ A subclass of pickler, to do cryptographic hashing, rather than
        pickling.
    """

    def __init__(self, hash_name='md5'):
        self.stream = io.BytesIO()
        # By default we want a pickle protocol that only changes with
        # the major python version and not the minor one
        protocol = (pickle.DEFAULT_PROTOCOL if PY3_OR_LATER
                    else pickle.HIGHEST_PROTOCOL)
        Pickler.__init__(self, self.stream, protocol=protocol)
        # Initialise the hash obj
        self._hash = hashlib.new(hash_name)

    def hash(self, obj, return_digest=True):
        try:
            self.dump(obj)
        except pickle.PicklingError as e:
            e.args += ('PicklingError while hashing %r: %r' % (obj, e),)
            raise
        dumps = self.stream.getvalue()
        self._hash.update(dumps)
        if return_digest:
            return self._hash.hexdigest()
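    # Usage sketch (illustrative): the object is pickled into an in-memory
    # stream and the bytes are fed to md5, so structurally equal objects yield
    # equal hex digests across processes, unlike the builtin hash():
    #
    #     >>> Hasher().hash({'a': 1}) == Hasher().hash({'a': 1})
    #     True
    #     >>> len(Hasher().hash([1, 2, 3]))  # length of an md5 hex digest
    #     32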
    def save(self, obj):
        if isinstance(obj, (types.MethodType, type({}.pop))):
            # the Pickler cannot pickle instance methods; here we decompose
            # them into components that make them uniquely identifiable
            if hasattr(obj, '__func__'):
                func_name = obj.__func__.__name__
            else:
                func_name = obj.__name__
            inst = obj.__self__
            if type(inst) == type(pickle):
                obj = _MyHash(func_name, inst.__name__)
            elif inst is None:
                # type(None) or type(module) do not pickle
                obj = _MyHash(func_name, inst)
            else:
                cls = obj.__self__.__class__
                obj = _MyHash(func_name, inst, cls)
        Pickler.save(self, obj)
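    # Illustration (a sketch): bound methods are not directly picklable, but
    # the decomposition above makes them hashable and deterministic:
    #
    #     >>> Hasher().hash({}.pop) == Hasher().hash({}.pop)
    #     True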
    def memoize(self, obj):
        # We want hashing to be sensitive to value instead of reference.
        # For example we want ['aa', 'aa'] and ['aa', 'aaZ'[:2]]
        # to hash to the same value and that's why we disable memoization
        # for strings
        if isinstance(obj, _bytes_or_unicode):
            return
        Pickler.memoize(self, obj)
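    # Illustration (a sketch) of the value-vs-reference point in the comment
    # above: the two lists below contain equal strings but distinct objects,
    # so memoization by id would pickle them differently:
    #
    #     >>> a = ['aa', 'aa']         # second element is the same object
    #     >>> b = ['aa', 'aaZ'[:2]]    # second element is a distinct object
    #     >>> Hasher().hash(a) == Hasher().hash(b)
    #     True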
    # The dispatch table of the pickler is not accessible in Python
    # 3, as these lines are only bugware for IPython, we skip them.
    def save_global(self, obj, name=None, pack=struct.pack):
        # We have to override this method in order to deal with objects
        # defined interactively in IPython that are not injected in
        # __main__
        kwargs = dict(name=name, pack=pack)
        if sys.version_info >= (3, 4):
            del kwargs['pack']
        try:
            Pickler.save_global(self, obj, **kwargs)
        except pickle.PicklingError:
            Pickler.save_global(self, obj, **kwargs)
            module = getattr(obj, "__module__", None)
            if module == '__main__':
                my_name = name
                if my_name is None:
                    my_name = obj.__name__
                mod = sys.modules[module]
                if not hasattr(mod, my_name):
                    # IPython doesn't inject the variables defined
                    # interactively in __main__
                    setattr(mod, my_name, obj)
    dispatch = Pickler.dispatch.copy()
    # builtin
    dispatch[type(len)] = save_global
    # type
    dispatch[type(object)] = save_global
    # classobj
    dispatch[type(Pickler)] = save_global
    # function
    dispatch[type(pickle.dump)] = save_global

    def _batch_setitems(self, items):
        # forces order of keys in dict to ensure consistent hash.
        try:
            # Trying first to compare dict assuming the type of keys is
            # consistent and orderable.
            # This fails on python 3 when keys are unorderable
            # but we keep it in a try as it's faster.
            Pickler._batch_setitems(self, iter(sorted(items)))
        except TypeError:
            # If keys are unorderable, sorting them using their hash. This is
            # slower but works in any case.
            Pickler._batch_setitems(self, iter(sorted((hash(k), v)
                                                      for k, v in items)))
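    # Illustration (a sketch): because dict items are sorted before pickling,
    # insertion order does not leak into the digest:
    #
    #     >>> d1 = {'a': 1, 'b': 2}
    #     >>> d2 = {'b': 2, 'a': 1}
    #     >>> Hasher().hash(d1) == Hasher().hash(d2)
    #     True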
    def save_set(self, set_items):
        # forces order of items in Set to ensure consistent hash
        Pickler.save(self, _ConsistentSet(set_items))

    dispatch[type(set())] = save_set
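# Illustration (a sketch): sets are rewritten as sorted _ConsistentSet
# sequences, so the digest is independent of set iteration order (which
# hash randomization changes between interpreter sessions for strings):
#
#     >>> Hasher().hash({'b', 'a'}) == Hasher().hash({'a', 'b'})
#     True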
class NumpyHasher(Hasher):
    """ Special case the hasher for when numpy is loaded. """

    def __init__(self, hash_name='md5', coerce_mmap=False):
        """
        Parameters
        ----------
        hash_name: string
            The hash algorithm to be used
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
            objects.
        """
        self.coerce_mmap = coerce_mmap
        Hasher.__init__(self, hash_name=hash_name)
        # delayed import of numpy, to avoid tight coupling
        import numpy as np
        self.np = np
        if hasattr(np, 'getbuffer'):
            self._getbuffer = np.getbuffer
        else:
            self._getbuffer = memoryview

    def save(self, obj):
        """ Subclass the save method, to hash ndarray subclass, rather
            than pickling them. Of course, this is a total abuse of
            the Pickler class.
        """
        if isinstance(obj, self.np.ndarray) and not obj.dtype.hasobject:
            # Compute a hash of the object
            # The update function of the hash requires a c_contiguous buffer.
            if obj.shape == ():
                # 0d arrays need to be flattened because viewing them as bytes
                # raises a ValueError exception.
                obj_c_contiguous = obj.flatten()
            elif obj.flags.c_contiguous:
                obj_c_contiguous = obj
            elif obj.flags.f_contiguous:
                obj_c_contiguous = obj.T
            else:
                # Cater for non-single-segment arrays: this creates a
                # copy, and thus alleviates this issue.
                # XXX: There might be a more efficient way of doing this
                obj_c_contiguous = obj.flatten()

            # memoryview is not supported for some dtypes, e.g. datetime64, see
            # https://github.com/numpy/numpy/issues/4983. The
            # workaround is to view the array as bytes before
            # taking the memoryview.
            self._hash.update(
                self._getbuffer(obj_c_contiguous.view(self.np.uint8)))

            # We store the class, to be able to distinguish between
            # objects with the same binary content, but different
            # classes.
            if self.coerce_mmap and isinstance(obj, self.np.memmap):
                # We don't make the difference between memmap and
                # normal ndarrays, to be able to reload previously
                # computed results with memmap.
                klass = self.np.ndarray
            else:
                klass = obj.__class__
            # We also return the dtype and the shape, to distinguish
            # different views on the same data with different dtypes.
            # The object will be pickled by the pickler hashed at the end.
            obj = (klass, ('HASHED', obj.dtype, obj.shape, obj.strides))
        elif isinstance(obj, self.np.dtype):
            # Atomic dtype objects are interned by their default constructor:
            # np.dtype('f8') is np.dtype('f8')
            # This interning is not maintained by a
            # pickle.loads + pickle.dumps cycle, because __reduce__
            # uses copy=True in the dtype constructor. This
            # non-deterministic behavior causes the internal memoizer
            # of the hasher to generate different hash values
            # depending on the history of the dtype object.
            # To prevent the hash from being sensitive to this, we use
            # .descr which is a full (and never interned) description of
            # the array dtype according to the numpy doc.
            klass = obj.__class__
            obj = (klass, ('HASHED', obj.descr))
        Hasher.save(self, obj)
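# Illustration (a sketch): the raw buffer is hashed along with (class, dtype,
# shape, strides), so equal copies match while different views over the same
# bytes do not:
#
#     >>> arr = np.arange(4, dtype=np.int64)
#     >>> NumpyHasher().hash(arr) == NumpyHasher().hash(arr.copy())
#     True
#     >>> NumpyHasher().hash(arr) == NumpyHasher().hash(arr.view(np.float64))
#     False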
def hash(obj, hash_name='md5', coerce_mmap=False):
    """ Quick calculation of a hash to uniquely identify Python objects
        containing numpy arrays.

        Parameters
        ----------
        hash_name: 'md5' or 'sha1'
            Hashing algorithm used. sha1 is supposedly safer, but md5 is
            faster.
        coerce_mmap: boolean
            Make no difference between np.memmap and np.ndarray
    """
    # NOTE: naming this function `hash` shadows the builtin hash() for the
    # rest of this module
    if 'numpy' in sys.modules:
        hasher = NumpyHasher(hash_name=hash_name, coerce_mmap=coerce_mmap)
    else:
        hasher = Hasher(hash_name=hash_name)
    return hasher.hash(obj)
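# Usage sketch (illustrative): numpy is already imported above, so this
# dispatches to NumpyHasher; key order in the dict does not affect the digest:
#
#     >>> digest = hash({'coef': np.ones(3), 'alpha': 0.5})
#     >>> digest == hash({'alpha': 0.5, 'coef': np.ones(3)})
#     True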