# Source code for etl_toolbox.dataframe_functions

'''
.. epigraph:: Functions for working with :class:`pandas.DataFrame`\\ s
'''

import numpy as np
import pandas as pd

from .cleaning_functions import clean_null, fingerprint


def find_column_labels(
    df, label_fingerprints, label_match_thresh=3, special_characters=''
):
    """
    Finds a row of column labels within a :class:`pandas.DataFrame` based on
    a collection of expected ``label_fingerprints``.

    ``df.columns`` is updated to contain the found column labels and all rows
    before and including the column label row are removed. If the initial
    column labels of ``df`` already fit the match criteria, ``df`` will be
    unchanged.

    .. note::
        All labels must be in the same row. This function will not attempt
        to correct translational errors.

    :param df:
        A :class:`pandas.DataFrame` containing column labels as a row,
        possibly preceded by misc. non-data rows.
    :param label_fingerprints:
        Fingerprinted label names that are expected in the column labels
        row. It can contain many variations on the expected label names,
        which makes this function useful for cleaning multiple files with
        varied column labels.

        .. note::
            If a `dict` is provided, the keys will be used as the
            fingerprints and the values will be ignored. A
            ``fingerprint_map`` dictionary created for use in the
            :mod:`~etl_toolbox.mapping_functions` module can be passed as
            ``label_fingerprints``.
    :type label_fingerprints: set, list, or dict
    :param label_match_thresh:
        The number of fingerprints that must be found in
        ``label_fingerprints`` for a row to be identified as the label row.
        Default is ``3``. This exists to prevent false matches caused by
        non-data header rows containing values that are also expected
        column labels.
    :type label_match_thresh: int, optional
    :param special_characters:
        (optional) A string of special characters to preserve while
        creating the fingerprints for lookup in ``label_fingerprints``. Any
        special characters that appear in the elements of
        ``label_fingerprints`` should be included here.
    :type special_characters: string, optional
    :raises IndexError:
        Raised if a label row can not be identified in the given
        :class:`~pandas.DataFrame`.
    :raises ValueError:
        Raised if the ``label_match_thresh`` is set to `0`.
    :return: Returns ``None``. The ``df`` argument is mutated.
    """
    if label_match_thresh == 0:
        raise ValueError("label_match_thresh can not be 0.")

    def _fingerprint_matches(values):
        # Number of values whose fingerprint appears in label_fingerprints
        return sum(
            1 for value in values
            if fingerprint(value, special_characters=special_characters)
            in label_fingerprints
        )

    # First, check if the initial labels are already correct. If they are,
    # exit without changing df.
    if _fingerprint_matches(df.columns) >= label_match_thresh:
        return None

    # Check whether the index is something other than the default
    # RangeIndex, and temporarily change it to a RangeIndex if it is.
    #
    # This is necessary because pandas doesn't provide a way to drop rows
    # in place using integer locations, however using the named index will
    # produce unexpected results if the DataFrame has duplicate index
    # values.
    initial_index = df.index
    initial_index_is_default = index_is_default(df)
    if not initial_index_is_default:
        df.reset_index(drop=True, inplace=True)

    # Iterate over rows to find the label index.
    # NOTE: iterating the row Series directly yields its values;
    # Series.iteritems() was removed in pandas 2.0, so it must not be used.
    label_index = None
    for i, row in df.iterrows():
        if _fingerprint_matches(row) >= label_match_thresh:
            label_index = i
            break

    if label_index is None:
        raise IndexError(
            'Label row could not be identified. Make sure '
            'label_fingerprints contains the expected label names.'
        )

    # Set DataFrame column labels
    df.rename(columns=df.loc[label_index], inplace=True)

    # Remove rows up to and including the label index
    df.drop(df.loc[:label_index].index, inplace=True)

    # If the initial index was a default RangeIndex, reset it so it is
    # numbered from 0.
    # Otherwise, restore the initial index, sliced to line up with the
    # modified DataFrame.
    if initial_index_is_default:
        df.reset_index(drop=True, inplace=True)
    else:
        df.index = initial_index[label_index + 1:]
def merge_columns_by_label(df, deduplicate_values=False):
    """
    Merges columns of a :class:`pandas.DataFrame` that have identical labels.

    For duplicate column labels in ``df``, the first instance of each label
    will be turned into a column of lists containing the values from all of
    the instances. The other instances will then be dropped. ``None`` and
    ``np.nan`` values will not be included in the merged column.

    .. note::
        This function does not fingerprint the labels for comparison.
        Column labels should be cleaned and mapped before using this tool.

    :param df: A :class:`pandas.DataFrame`.
    :param deduplicate_values:
        If ``True``, the values of the combined columns will be
        deduplicated and stored in the modified :class:`~pandas.DataFrame`
        as a `set` instead of a `list`.
    :type deduplicate_values: boolean, optional
    :return: Returns ``None``. The ``df`` argument is mutated.
    """
    # Get the set of duplicate column labels
    duplicate_labels = set(df.columns[df.columns.duplicated()])

    for label in duplicate_labels:
        # Get integer location of first instance of label
        first_iloc = next(i for i, x in enumerate(df.columns) if x == label)

        # Make a new column that combines the values for all instances of
        # label. df[label] selects every instance at once, so each row of
        # .values holds that row's values across all duplicate columns.
        merged_column = []
        for row_values in df[label].values:
            # Remove None and nan
            populated = [x for x in row_values if not pd.isnull(x)]
            merged_column.append(
                set(populated) if deduplicate_values else populated
            )

        df.insert(first_iloc, 'temp', merged_column)

        # Drop the duplicate columns
        if label is None:
            # drop() can't take None as a column name, so this is a
            # workaround. As long as df doesn't contain this exact label
            # name, it will work as expected.
            placeholder_label = (
                '**be15492e-d01e-4ab9-9f2c-116525a26897**-label'
            )
            df.columns = [
                placeholder_label if x is None else x for x in df.columns
            ]
            df.drop(columns=placeholder_label, inplace=True)
        else:
            df.drop(columns=label, inplace=True)

        # Name the new column back to the original label
        new_labels = df.columns.tolist()
        new_labels[first_iloc] = label
        df.columns = new_labels
def index_is_default(df):
    """
    Returns ``True`` if the provided :class:`~pandas.DataFrame` has the
    default :class:`pandas.RangeIndex`. Else returns ``False``.
    """
    # A default index is an unnamed RangeIndex running 0..len(df)-1
    default_index = pd.RangeIndex(len(df))
    return df.index.name is None and df.index.equals(default_index)
def dataframe_clean_null(
    df,
    empty_row_thresh=1,
    empty_column_thresh=1,
    falsey_is_null=False,
    special_characters='',
):
    """
    Cleans null values of a :class:`pandas.DataFrame` and removes empty
    rows/columns.

    .. warning::
        This function is computationally intensive and might be slow on
        large :class:`~pandas.DataFrame`\\ s.

    :param df: A :class:`pandas.DataFrame`.
    :param empty_row_thresh:
        The number of non-null values required for a row to be considered
        populated/non-empty. Default is ``1``. If set to ``0``, no rows
        will be removed.
    :type empty_row_thresh: int, optional
    :param empty_column_thresh:
        The number of non-null values required for a column to be
        considered populated/non-empty. Default is ``1``.

        .. note::
            Rows are dropped before columns, so this threshold will be
            applied to the values that remain **after rows are removed**.
            If ``empty_column_thresh`` is greater than ``1``, the resulting
            ``df`` may contain rows with fewer populated cells than
            ``empty_row_thresh``. However, it will never contain completely
            empty rows (unless ``empty_row_thresh`` is ``0``).

        If set to ``0``, no columns will be removed.
    :type empty_column_thresh: int, optional
    :param falsey_is_null:
        Controls whether falsey objects are considered *null-indicating*.
        See :func:`cleaning_functions.clean_null()
        <etl_toolbox.cleaning_functions.clean_null>` for details. Default
        is ``False``.
    :type falsey_is_null: boolean, optional
    :param special_characters:
        A string of special characters to preserve while creating the
        fingerprints. See :func:`cleaning_functions.fingerprint()
        <etl_toolbox.cleaning_functions.fingerprint>` for details.
    :type special_characters: string, optional
    :return: Returns ``None``. The ``df`` argument is mutated.
    """
    had_default_index = index_is_default(df)

    # Replace every null-indicating cell with np.nan. Cells that
    # clean_null() leaves non-None are intentionally kept untouched.
    n_rows, n_cols = df.shape
    for r in range(n_rows):
        for c in range(n_cols):
            cleaned = clean_null(
                df.iloc[r, c],
                falsey_is_null=falsey_is_null,
                special_characters=special_characters,
            )
            if cleaned is None:
                df.iloc[r, c] = np.nan

    # Drop under-populated rows first, then under-populated columns
    df.dropna(axis=0, thresh=empty_row_thresh, inplace=True)
    df.dropna(axis=1, thresh=empty_column_thresh, inplace=True)

    # Dropping columns can empty out rows that previously met
    # empty_row_thresh, so sweep fully-empty rows afterwards (unless row
    # removal is disabled via empty_row_thresh=0)
    if empty_column_thresh > 1 and empty_row_thresh != 0:
        df.dropna(axis=0, how='all', inplace=True)

    # Renumber from 0 if the caller started with a default RangeIndex
    if had_default_index:
        df.reset_index(drop=True, inplace=True)