Source code for etl_toolbox.mapping_functions

'''
.. epigraph:: Functions for mapping collections of values
'''

from .cleaning_functions import fingerprint


[docs]def map_labels( labels, fingerprint_map, special_characters='', return_unmapped=False ): """ Maps a list of ``labels`` to new values based on provided ``fingerprint_map`` and returns that mapped list. This is useful for mapping column labels from a dataframe/file to a standard schema, particularly when the labels are inconsistent. The order of ``labels`` will be preserved in the return, and if a label isn't found in ``fingerprint_map``, that label will be ``'-'`` in returned list. Usage: >>> from etl_toolbox.mapping_functions import map_labels >>> labels = [1, '2_A', '2b'] >>> fingerprint_map = {'1': 'one', '2a': 'two_a', 'extrakey': 'value'} >>> map_labels(labels, fingerprint_map) ['one', 'two_a', '-'] :param labels: The list of labels to map. These will be fingerprinted for their lookup in ``fingerprint_map``. :param fingerprint_map: A dictionary of all expected label fingerprints mapped to formatted outputs. :param special_characters: A string of special characters to preserve while fingerprinting the labels. See :func:`cleaning_functions.fingerprint() <etl_toolbox.cleaning_functions.fingerprint>` for details. .. note:: This should include any special characters that appear in the keys of ``fingerprint_map``. :type special_characters: string, optional :param return_unmapped: If this is set to ``True``, this function will return a tuple of the mapped labels and a set of unmapped labels (any value from ``labels`` whose fingerprint was not found in ``fingerprint_map``). Default is ``False``. .. note:: This is useful for tracking unrecognized labels of incoming files in an automated ETL system. :type return_unmapped: boolean, optional :return: Returns a list or, if the ``return_unmapped`` option is ``True``, returns a tuple, with the first element being a list and the second being a set. """ mapped_labels = [] unmapped_labels = set() for x in labels: x_fingerprint = fingerprint(x, special_characters=special_characters) if x_fingerprint in fingerprint_map: x_mapped = fingerprint_map[x_fingerprint] mapped_labels.append(x_mapped) else: mapped_labels.append('-') unmapped_labels.add(x) if return_unmapped: return (mapped_labels, unmapped_labels) return mapped_labels
[docs]def append_count(x): """ A generator function that yields ``x`` with a numbered suffix. """ i = 0 while True: i += 1 yield x + '_' + str(i)
[docs]def rename_duplicate_labels(labels, rename_generator=append_count): """ Maps a list of ``labels`` such that duplicates are renamed according to the ``rename_generator``. The order of ``labels`` is preserved in the return, and if a label isn't a duplicate, its value will be unchanged. Values will NOT be fingerprinted for comparison, so this function is best used after labels have been standardized. Usage: >>> from etl_toolbox.mapping_functions import rename_duplicate_labels >>> labels = ['email', 'email', 'phone', 'name', 'email', 'phone'] >>> rename_duplicate_labels(labels) ['email_1', 'email_2', 'phone_1', 'name', 'email_3', 'phone_2'] :param labels: The list of labels to map. :param rename_generator: A generator function that specifies how to rename duplicate columns. It should take a label name as a positional argument and yield the renamed label. The default ``rename_generator`` appends a count, separated by underscore. Example: >>> def rename_generator(x): ... i = 0 ... while True: ... i += 1 ... yield x + '_' + str(i) >>> r = rename_generator('label') >>> next(r) 'label_1' >>> next(r) 'label_2' :type rename_generator: generator, optional :return: Returns a list. """ # Create dictionary of duplicate labels with initialized rename_generators seen = set() duplicates = {} for x in labels: if x in seen: duplicates[x] = rename_generator(x) else: seen.add(x) # Create a new list with each duplicate label renamed according to its # rename_generator mapped_labels = [] for x in labels: if x in duplicates: x_renamed = next(duplicates[x]) mapped_labels.append(x_renamed) else: mapped_labels.append(x) return mapped_labels