# Source code for dicthash

"""
dicthash.dicthash
=================

A module implementing an md5 hash function for (nested) dictionaries.

Functions
---------

generate_hash_from_dict - generate an md5 hash from a (nested)
dictionary

"""
import hashlib
import warnings

from typing import overload, List, Union, Iterable, Optional, Hashable

# Scale factor that floats are multiplied with before being truncated to
# an integer; non-zero floats with a magnitude below 1 / FLOAT_FACTOR are
# considered too small for a safe conversion.
FLOAT_FACTOR = 1e15
# If True, floats that are too small for a safe conversion are set to zero
# (with a UserWarning) instead of raising a ValueError.
FLOOR_SMALL_FLOATS = False

# Apply the 'default' filter action to UserWarning, i.e. report the first
# occurrence of a warning per location. Note that the warnings module
# writes to sys.stderr (not sys.stdout) unless showwarning is overridden.
warnings.simplefilter('default', category=UserWarning)


def _save_convert_float_to_int(x: float) -> int:
    """
    Convert a float x to an integer. Avoid rounding errors on different
    platforms by shifting the floating point behind the last relevant
    digit.

    Parameters
    ----------
    x : float
        Float to be converted.

    Returns
    -------
    int
        ``x * FLOAT_FACTOR`` truncated towards zero.

    Raises
    ------
    ValueError
        If x is non-zero, its magnitude is below ``1 / FLOAT_FACTOR``
        and ``FLOOR_SMALL_FLOATS`` is False.
    """
    if 0. < abs(x) < 1. / FLOAT_FACTOR:
        if not FLOOR_SMALL_FLOATS:
            raise ValueError('Float too small for safe conversion to '
                             'integer.')
        x = 0.
        # Adjacent string literals are joined verbatim, so the blank
        # separating 'to' and 'integer' has to be part of a literal.
        warnings.warn('Float too small for safe conversion to '
                      'integer. Rounding down to zero.', UserWarning)
    return int(x * FLOAT_FACTOR)


def _unpack_value(value: Union[dict, Iterable, float, int],
                  prefix: str = '', whitelist: Optional[List[Hashable]] = None,
                  blacklist: Optional[List[Hashable]] = None) -> str:
    """
    Turn a single value of a data structure into its string
    representation by dispatching on the type of the value.

    Dictionaries are handed to ``_generate_string_from_dict``, other
    iterables (including strings) to ``_generate_string_from_iterable``,
    floats are first converted to integers and every remaining value
    is passed through ``str``.

    Parameters
    ----------
    value : Union[dict, Iterable, float, int]
        Value to be unpacked.
    prefix : str, optional
        String prepended to the result. Defaults to the empty string.
    whitelist : List[Hashable], optional
        Whitelist forwarded to the dictionary conversion; only used if
        value is a dict.
    blacklist : List[Hashable], optional
        Blacklist forwarded to the dictionary conversion; only used if
        value is a dict.

    Returns
    -------
    str
        Unpacked values.
    """
    # dictionaries: mark the nesting level with a trailing 'd'
    if isinstance(value, dict):
        return _generate_string_from_dict(value,
                                          blacklist=blacklist,
                                          whitelist=whitelist,
                                          prefix=prefix + 'd')

    # any other iterable; strings end up here as well
    if isinstance(value, Iterable):
        return prefix + _generate_string_from_iterable(value, prefix='i')

    # scalars: floats are shifted to integers before string conversion
    if isinstance(value, float):
        scalar = _save_convert_float_to_int(value)
    else:
        scalar = value
    return prefix + str(scalar)


def _generate_string_from_iterable(l: Iterable, prefix: str = '') -> str:
    """
    Build a string from an iterable by unpacking each of its elements
    in iteration order. Floats are handled by ``_unpack_value`` to
    avoid rounding errors.

    Parameters
    ----------
    l : Iterable
        Iterable to be converted.
    prefix : str, optional
        String prepended to the result if l is a string. It is not
        used for any other iterable. Defaults to the empty string.

    Returns
    -------
    str
        Concatenation of the unpacked elements, or prefix followed by
        l itself if l is a string.
    """
    # Strings are iterables of strings; stop here instead of recursing
    # into their characters forever.
    if isinstance(l, str):
        return prefix + str(l)

    pieces = []
    for element in l:
        pieces.append(_unpack_value(element, prefix=''))
    return ''.join(pieces)


def _generate_string_from_dict(d: dict, blacklist: Optional[List[Hashable]] = None,
                               whitelist: Optional[List[Hashable]] = None, prefix: str = '') -> str:
    """
    Build a string from a dictionary by unpacking the value of every
    selected key. Floats, iterables and nested dictionaries are
    handled by ``_unpack_value``.

    Parameters
    ----------
    d : dict
        Dictionary to be converted
    blacklist : List[Hashable], optional
        List of keys to exclude from conversion. Blacklist overrules
        whitelist, i.e., keys appearing in the blacklist will
        definitely not be used.
    whitelist: List[Hashable], optional
        List of keys to include in conversion. If None, all keys of d
        are used.
    prefix : str, optional
        String prepended (together with the key) to every unpacked
        value. Defaults to the empty string.

    Returns
    -------
    str
        Concatenation of the unpacked values, or the empty string if
        no key is selected.
    """
    # without an explicit whitelist every key of d is a candidate
    selected = list(d.keys()) if whitelist is None else whitelist
    if blacklist is not None:
        selected = list(set(selected).difference(blacklist))
    if len(selected) == 0:
        return ''

    # Visit the top-level keys ordered by their string representation
    ordered_keys = sorted(filter_blackwhitelist(selected, None), key=str)
    pieces = []
    for key in ordered_keys:
        pieces.append(
            _unpack_value(d[key],
                          whitelist=filter_blackwhitelist(selected, key),
                          blacklist=filter_blackwhitelist(blacklist, key),
                          prefix=prefix + str(key)))
    return ''.join(pieces)


def generate_hash_from_dict(d: dict, blacklist: Optional[List[Hashable]] = None,
                            whitelist: Optional[List[Hashable]] = None,
                            raw: bool = False) -> str:
    """
    Generate an md5 hash from a (nested) dictionary.

    Takes care of extracting nested dictionaries, iterables and
    avoids rounding errors of floats. Makes sure keys are read in a
    unique order. A blacklist of keys can be passed, that can contain
    keys which should be excluded from the hash. If a whitelist is
    given, only keys appearing in the whitelist are used to generate
    the hash. All strings are converted to unicode, i.e., the hash
    does not distinguish between strings provided in ascii or unicode
    format. Lists, np.ndarrays and tuples are treated equally, i.e., an
    array-like item [1,2,3], np.array([1,2,3]) or (1,2,3) will lead
    to the same hash if they are of the same type.

    Parameters
    ----------
    d : dict
        Dictionary to compute the hash from.
    blacklist : List[Hashable], optional
        List of keys which *are not* used for generating the hash.
        Keys of sub-dictionaries can be provided by specifying
        the full path of keys in a tuple.
        If None, no keys will be ignored.
    whitelist : List[Hashable], optional
        List of keys which *are* used for generating the hash.
        Keys of sub-dictionaries can be provided by specifying
        the full path of keys in a tuple.
        If None, all keys will be used.
        Blacklist overrules whitelist, i.e., keys appearing in the
        blacklist will definitely not be used.
    raw : bool, optional
        if True, return the unhashed string.

    Returns
    -------
    str
        The hash generated from the dictionary, or the unhashed string
        if raw is True.

    Raises
    ------
    TypeError
        If d is not a dictionary.
    KeyError
        If an entry of blacklist or whitelist does not appear in d.

    Example
    -------
    >>> from dicthash import generate_hash_from_dict
    >>> d = {'a': 'asd', 'b': 0.12, 3: {'c': [3, 4, 5]}}
    >>> generate_hash_from_dict(d)
    'd748bbf148db514911ed0bf215729d01'

    """
    if not isinstance(d, dict):
        raise TypeError('Please provide a dictionary.')

    # reject black/whitelists that refer to keys not present in d
    if blacklist is not None:
        validate_blackwhitelist(d, blacklist)
    if whitelist is not None:
        validate_blackwhitelist(d, whitelist)

    raw_string = _generate_string_from_dict(d, blacklist, whitelist, prefix='d')
    if raw:
        return raw_string
    return hashlib.md5(raw_string.encode('utf-8')).hexdigest()
def validate_blackwhitelist(d: dict, l: list) -> None:
    """
    Validate that all entries in black/whitelist l, appear in the
    dictionary d

    Entries that are tuples are interpreted as a path of keys into
    nested dictionaries and are validated level by level.

    Parameters
    ----------
    d : dict
        Dictionary to use for validation.
    l : list
        Blacklist or whitelist to validate.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If an entry (or an element of a path) is not found.
    """
    for key in l:
        is_path = isinstance(key, tuple)
        # for a path only its first element lives on this level
        k = key[0] if is_path else key
        if k not in d:
            raise KeyError('Key "{key}" not found in dictionary. '
                           'Invalid black/whitelist.'.format(key=key))
        if is_path and len(key) > 1:
            # NOTE(review): d[k] is not checked to be a dict, the
            # membership test above is applied to whatever it is --
            # confirm that this is intended.
            validate_blackwhitelist(d[k], [key[1:]])


@overload
def filter_blackwhitelist(l: None, key: Optional[Hashable]) -> None:
    ...


@overload
def filter_blackwhitelist(l: list, key: Optional[Hashable]) -> list:
    ...


def filter_blackwhitelist(l: Optional[list], key: Optional[Hashable]) -> Union[list, None]:
    """
    Filter black/whitelist for the keys that belong to the
    sub-dictionary which is embedded into the nested dictionary
    structure with the given key.

    Three different cases:
    - if l is None, then return None
    - if key is None, then we are at the top-level dictionary, thus
      include all scalar keys and the first element of tuples.
    - if key is not None, then return only the keys that are tuples
      where the first element of the tuple matches the given key

    Parameters
    ----------
    l : list
        Black- or whitelist to filter
    key : Hashable, optional
        Key to filter for. See above for the behavior if key is None

    Returns
    -------
    Union[list, None]
        The filtered list, or None if l is None or no entry is left
        after filtering.
    """
    if l is None:
        return None

    fl = []
    for k in l:
        if isinstance(k, tuple):
            if key is None:
                # top level: a path contributes its first element
                fl.append(k[0])
            elif k[0] == key:
                # strip the matched head; a remaining single element
                # is stored as a scalar key, not as a 1-tuple
                fl.append(k[1] if len(k) == 2 else k[1:])
        elif key is None:
            # scalar keys only belong to the top level
            fl.append(k)
    return fl or None