Skip to content

Py-utilz

Dataframe verbs and tools

ejolly/py-utilz

Dataframe verbs and tools

The dfverbs module is intended to be imported as an alias and used inside pipe for dplyr like data manipulation grammar. Using the sample on the redframes README:

import numpy as np  # needed by the np.round call in the last mutate step
import pandas as pd
from utilz import pipe, randdf
import utilz.dfverbs as _

# Define demo df
df = pd.DataFrame({
    'bear': ['Brown bear', 'Polar bear', 'Asian black bear', 'American black bear', 'Sun bear', 'Sloth bear', 'Spectacled bear', 'Giant panda'],
    'genus': ['Ursus', 'Ursus', 'Ursus', 'Ursus', 'Helarctos', 'Melursus', 'Tremarctos', 'Ailuropoda'],
    'weight (male, lbs)': ['300-860', '880-1320', '220-440', '125-500', '60-150', '175-310', '220-340', '190-275'],
    'weight (female, lbs)': ['205-455', '330-550', '110-275', '90-300', '45-90', '120-210', '140-180', '155-220']
})

# Each verb receives the output of the previous step
out = pipe(
    df,
    _.rename({"weight (male, lbs)": "male", "weight (female, lbs)": "female"}),
    _.pivot_longer(columns=["male", "female"], into=("sex", "weight")),
    _.split("weight", ("min", "max"), sep="-"),
    _.pivot_longer(columns=["min", "max"], into=("stat", "weight")),
    _.astype({"weight": float}),
    _.groupby("genus", "sex"),
    _.summarize(weight="weight.mean()"),
    _.pivot_wider(column="sex", using="weight"),
    _.mutate(dimorphism="male / female"),  # no rounding possible
    _.mutate(dimorphism=lambda male, female: np.round(male / female, 2)) # instead use a func
)

Note

The dftools module, on the other hand, is not intended to be imported at all. Instead, it automatically defines new methods on pandas DataFrame and DataFrameGroupBy objects, e.g. df.select('-Col1') is a new method that allows for R-style column selection.

Verbs

dplyr like verbs for working with pandas dataframes.

`apply(*args, **kwargs)`

Call a dataframe or groupby object's .apply method. For grouped dataframes, resets and drops the index by default. Change this with reset_index='drop'|'reset'|'none'

Source code in utilz/dfverbs/verbs.py

@curry
def apply(*args, **kwargs):
    """Forward the given arguments to the `.apply` method of the piped dataframe or
    groupby object.

    The `reset_index` keyword ('drop' | 'reset' | 'none', default 'drop') is consumed
    here rather than forwarded. It is only used when the piped object is a
    `DataFrameGroupBy`, in which case the result is passed through
    `_reset_index_helper`.
    """
    index_mode = kwargs.pop("reset_index", "drop")

    def _apply(df):
        applied = df.apply(*args, **kwargs)
        # Plain dataframes are returned untouched; only grouped input is re-indexed
        if not isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
            return applied
        return _reset_index_helper(applied, index_mode)

    return _apply

`assign(**kwargs)`

Call a dataframe object's .assign method

Source code in utilz/dfverbs/verbs.py

@curry
def assign(**kwargs):
    """Build a pipeline step that forwards the keyword arguments to the piped
    dataframe's `.assign` method and returns the result."""

    def _assign(df):
        return df.assign(**kwargs)

    return _assign

`astype(cols, df)`

Cast one or more columns to a type. Like .rename(), you can either input a single tuple to cast 1 column or a dict to cast multiple

Source code in utilz/dfverbs/verbs.py

@curry
def astype(cols, df):
    """Cast one or more columns of `df` to a new type.

    `cols` is either a `(column, dtype)` tuple for a single column or a dict mapping
    column names to dtypes. Returns the result of `df.astype`.
    """
    mapping = cols
    if isinstance(cols, tuple):
        # Normalise the single-column shorthand into the dict form
        column, dtype = cols[0], cols[1]
        mapping = {column: dtype}
    return df.astype(mapping)

`call(*args, **kwargs)`

Call an arbitrary method or function on an object, e.g. pipe(df, _.call('mean')) would call df.mean()

Source code in utilz/dfverbs/verbs.py

@curry
def call(*args, **kwargs):
    """Look up an attribute by name on the piped object and call it, e.g.
    `pipe(df, _.call('mean'))` runs `df.mean()`.

    The first positional argument is the attribute name; remaining positional and
    keyword arguments are forwarded to it. Raises `AttributeError` when the lookup
    yields nothing.
    """

    def _invoke(df):
        attr_name = args[0]
        target = getattr(df, attr_name, None)
        if target is None:
            raise AttributeError(f"{type(df)} does not have a {attr_name} method")
        return target(*args[1:], **kwargs)

    return _invoke

`concat(*args, **kwargs)`

Call pd.concat

Source code in utilz/dfverbs/verbs.py

@curry
def concat(*args, **kwargs):
    """Pass every argument straight through to `pd.concat` and return what it
    produces."""
    combined = pd.concat(*args, **kwargs)
    return combined

`drop(*args)`

Call a dataframe's .drop(axis=1) method. Column names should be passed as multiple args like .select(), e.g. _.drop('height', 'weight')

Source code in utilz/dfverbs/verbs.py

@curry
def drop(*args):
    """Drop columns from the piped dataframe via `do("drop", ..., axis=1)`.

    Column names are given as separate positional arguments (as with `.select()`),
    e.g. `_.drop('height', 'weight')`.
    """

    def _drop(df):
        columns = list(args)
        return do("drop", df, columns, axis=1)

    return _drop

`fillna(*args, **kwargs)`

Call a dataframe's fillna method

Source code in utilz/dfverbs/verbs.py

@curry
def fillna(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.fillna` method."""

    def _fillna(df):
        filled = df.fillna(*args, **kwargs)
        return filled

    return _fillna

`groupby(*args)`

Call a dataframe's .groupby method

Source code in utilz/dfverbs/verbs.py

@curry
def groupby(*args):
    """Group the piped dataframe via `do("groupby", ...)`; the grouping keys are
    given as separate positional arguments and passed on as one list."""

    def _groupby(df):
        keys = list(args)
        return do("groupby", df, keys)

    return _groupby

`head(*args, **kwargs)`

Call dataframe's .head() method

Source code in utilz/dfverbs/verbs.py

@curry
def head(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.head()` method."""

    def _head(df):
        top = df.head(*args, **kwargs)
        return top

    return _head

`join(*args, **kwargs)`

Join a dataframe with another one using pandas' join

Source code in utilz/dfverbs/verbs.py

@curry
def join(*args, **kwargs):
    """Call a dataframe's `.join` method, e.g. `pipe(df, _.join(other, on='key'))`
    runs `df.join(other, on='key')`.

    pandas has no top-level `pd.join` function, so the arguments are forwarded to the
    piped dataframe's own `.join` method instead.
    """

    def call(df):
        return df.join(*args, **kwargs)

    return call

`merge(*args, **kwargs)`

Call pd.merge

Source code in utilz/dfverbs/verbs.py

@curry
def merge(*args, **kwargs):
    """Pass every argument straight through to `pd.merge` and return what it
    produces."""
    merged = pd.merge(*args, **kwargs)
    return merged

`mutate(dfg, **kwargs)`

Creates a new column(s) in a DataFrame based on a function of existing columns in the DataFrame. Always returns a dataframe the same size as the original. For groupby inputs, the result is always ungrouped.

Just like .summarize(), input should be kwargs organized like new_column = str | function. Such as: _.mutate(weight_centered='weight - weight.mean()') or _.mutate(weight_centered=lambda weight: weight - weight.mean()) or _.mutate(weight_centered=lambda df: df['weight'].apply(lambda x: x - x.mean())). To return output smaller than the input dataframe use .summarize() instead.

Source code in utilz/dfverbs/verbs.py

@curry
def mutate(dfg, **kwargs):
    """
    Creates a new column(s) in a DataFrame based on a function of existing columns in
    the DataFrame. Always returns a dataframe the same size as the original. For groupby
    inputs, **the result is always ungrouped.**

    Just like `.summarize()`, input should be kwargs organized like `new_column = str |
    function`. Such as: `_.mutate(weight_centered='weight - weight.mean()')`
    or `_.mutate(weight_centered=lambda weight: weight - weight.mean())` or
    `_.mutate(weight_centered=lambda df: df['weight'].apply(lambda x: x - x.mean()))`.
    To return output *smaller* than the input dataframe use `.summarize()` instead.

    Args:
        dfg: a DataFrame or DataFrameGroupBy (normally the previous pipe step)
        **kwargs: `new_column=value` pairs. A str is evaluated with `.eval`. A
        function is dispatched on its positional parameter names: a single
        parameter called `df` (or `g`/`group` for grouped input) receives the
        whole frame/group, any other names are looked up as columns. For
        ungrouped input any other value is assigned directly.

    Raises:
        TypeError: for grouped input when a value is neither a str nor callable
    """

    if isinstance(dfg, pd.core.groupby.generic.DataFrameGroupBy):
        prev = dfg.obj.copy()
        for k, v in kwargs.items():
            if isinstance(v, str):
                res = dfg.apply(lambda group: group.eval(v)).reset_index()
            elif callable(v):
                # Only the declared positional parameters name columns; the full
                # co_varnames tuple also contains the function's local variables
                name = v.__code__.co_varnames[: v.__code__.co_argcount]
                if len(name) == 1:
                    # Normal assign where we pass in the entire dataframe to the calling
                    # function
                    if name[0] in ["df", "g", "group"]:
                        res = dfg.apply(v).reset_index()
                    else:
                        # Single column apply
                        res = dfg.apply(lambda g: v(g[name[0]])).reset_index()
                else:
                    # Multi-column
                    res = dfg.apply(lambda g: v(*[g[e] for e in name])).reset_index()
            else:
                raise TypeError(
                    f"grouped dataframes cannot make direct assignments. You must pass in a str to be evaluated or a function but you passed in a {type(v)}"
                )

            # Calling an operation that returns df the same size as the original df,
            # like transform, e.g. 'A1 - A1.mean()'
            if res.shape[0] == prev.shape[0]:
                # reset_index() exposes the original row index as a 'level_*' column;
                # it is the key used to line the result back up with `prev`
                level_col_idx, level_col_name = [
                    (i, col)
                    for i, col in enumerate(res.columns)
                    if str(col).startswith("level_")
                ][0]

                res = res.rename(columns={res.columns[-1]: k})

                # Allow column overwriting
                if k in prev:
                    prev = prev.drop(columns=k).merge(
                        res.iloc[:, level_col_idx:],
                        left_index=True,
                        right_on=level_col_name,
                    )
                else:
                    # prev = prev.join(res[k])
                    prev = prev.merge(
                        res.iloc[:, level_col_idx:],
                        left_index=True,
                        right_on=level_col_name,
                    )
                prev = prev.drop(columns=level_col_name).reset_index(drop=True)
            else:
                # otherwise operation returns smaller
                # so we need to join on the grouping col which is the name of the first
                # col in the output
                res = res.rename(columns={res.columns[-1]: k})
                # Allow column overwriting
                if k in prev:
                    prev = prev.drop(columns=k).merge(
                        res, on=res.columns[:-1].to_list()
                    )
                else:
                    prev = prev.merge(res, on=res.columns[:-1].to_list())
        return prev
    else:
        out = dfg.copy()
        for k, v in kwargs.items():
            if isinstance(v, str):
                # NOTE: evaluated against the input frame, so an expression cannot
                # reference a column created earlier in this same mutate call
                out = out.assign(**{k: dfg.eval(v)})
            elif callable(v):
                # Positional parameters only (see the grouped branch above)
                name = v.__code__.co_varnames[: v.__code__.co_argcount]
                if len(name) == 1:
                    # Normal assign where we pass in the entire dataframe to the calling
                    # function
                    if name[0] == "df":
                        out = out.assign(**{k: v})
                    else:
                        # Single column apply
                        out = out.assign(**{k: lambda df: v(df[name[0]])})
                else:
                    # Multi-column
                    # get columns as list
                    cols = [dfg[e] for e in name]
                    out = out.assign(**{k: v(*cols)})
            else:
                # Normal assignment
                out = out.assign(**{k: v})

        return out

`pivot_longer(*args, **kwargs)`

Convert a list of columns into 2 columns. Can pass a list of columns to melt down or id_vars to select everything else: e.g. _.pivot_longer(['male', 'female'], into=('gender', 'response')) or _.pivot_longer(id_vars='SID', into=('gender','response'))

Parameters:

Name	Type	Description	Default
`columns`	`list or None`	columns to melt; Defaults to None	required
`id_vars`	`list or None`	columns to use as id variables; Default to None	required
`into`	`tuple`	cols to create Defaults to ("variable", "value").	required
`make_index`	`bool`	does a reset_index prior to melting and adds the index col to id_vars. Defaults to False.	`False`

Source code in utilz/dfverbs/verbs.py

@curry
def pivot_longer(*args, **kwargs):
    """
    Melt several columns down into 2 columns by forwarding all arguments to the piped
    dataframe's `.pivot_longer` method. Either list the columns to melt or give
    `id_vars` to melt everything else, e.g. `_.pivot_longer(['male', 'female'],
    into=('gender', 'response'))` or
    `_.pivot_longer(id_vars='SID', into=('gender','response'))`

    Args:
        columns (list or None): columns to melt; Defaults to None
        id_vars (list or None): columns to use as id variables; Default to None
        into (tuple, optional): cols to create Defaults to ("variable", "value").
        make_index (bool, optional): does a reset_index prior to melting and adds the
        index col to id_vars. Defaults to False.

    """

    def _pivot_longer(df):
        melted = df.pivot_longer(*args, **kwargs)
        return melted

    return _pivot_longer

`pivot_wider(*args, **kwargs)`

Convert a pair of columns to multiple columns, e.g. _.pivot_wider('condition', using='response')

Parameters:

Name	Type	Description	Default
`column`	`str`	string name of column to "explode"	required
`using`	`str`	string name of column who's values should be placed into the new columns	required
`drop_index`	`bool; optional`	if a 'prev_index' col exists (usually created by make_index=True in pivot_longer) will drop it; Default True	`True`

Source code in utilz/dfverbs/verbs.py

@curry
def pivot_wider(*args, **kwargs):
    """
    Spread a pair of columns out into multiple columns by forwarding all arguments to
    the piped dataframe's `.pivot_wider` method, e.g.
    `_.pivot_wider('condition', using='response')`

    Args:
        column (str): string name of column to "explode"
        using (str): string name of column whose values should be placed into the new
        columns
        drop_index (bool; optional): if a 'prev_index' col exists (usually created by
        make_index=True in pivot_longer) will drop it; Default True

    """

    def _pivot_wider(df):
        widened = df.pivot_wider(*args, **kwargs)
        return widened

    return _pivot_wider

`query(q, **kwargs)`

Call a dataframe object's .query method. Resets and drops index by default. Change this with reset_index='drop'|'reset'|'none'

Source code in utilz/dfverbs/verbs.py

@curry
def query(q, **kwargs):
    """
    Call a dataframe object's `.query` method. Resets and drops index by
    default. Change this with `reset_index='drop'|'reset'|'none'`

    Args:
        q (str or callable): a str is passed to `df.query` along with the remaining
        kwargs. A callable is dispatched on its positional parameter names: a
        single parameter called `df` is handed to `df.loc`, any other names are
        looked up as columns and the returned mask is used to index the frame.
        Any other type leaves the rows unfiltered.
        reset_index (str, optional): forwarded to `_reset_index_helper`
    """
    reset_index = kwargs.pop("reset_index", "drop")

    def call(df):
        if isinstance(q, str):
            df = df.query(q, **kwargs)
        elif callable(q):
            # Only the declared positional parameters name columns; the full
            # co_varnames tuple also contains the function's local variables
            name = q.__code__.co_varnames[: q.__code__.co_argcount]
            if len(name) == 1:
                if name[0] == "df":
                    df = df.loc[q]
                else:
                    df = df[q(df[name[0]])]
            else:
                df = df[q(*[df[e] for e in name])]

        return _reset_index_helper(df, reset_index)

    return call

`read_csv(*args, **kwargs)`

Call pd.read_csv

Source code in utilz/dfverbs/verbs.py

@curry
def read_csv(*args, **kwargs):
    """Pass every argument straight through to `pd.read_csv` and return what it
    produces."""
    loaded = pd.read_csv(*args, **kwargs)
    return loaded

`rename(cols, df)`

Rename one or more columns. Can either input a single tuple to rename 1 column or a dict to rename multiple

Source code in utilz/dfverbs/verbs.py

@curry
def rename(cols, df):
    """Rename one or more columns of `df`.

    `cols` is either an `(old, new)` tuple for a single column or a dict mapping old
    names to new names. Returns the result of `df.rename(columns=...)`.
    """
    mapping = cols
    if isinstance(cols, tuple):
        # Normalise the single-column shorthand into the dict form
        old_name, new_name = cols[0], cols[1]
        mapping = {old_name: new_name}
    return df.rename(columns=mapping)

`replace(*args, **kwargs)`

Call a dataframe's replace method

Source code in utilz/dfverbs/verbs.py

@curry
def replace(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.replace` method."""

    def _replace(df):
        replaced = df.replace(*args, **kwargs)
        return replaced

    return _replace

`reset_index(*args, **kwargs)`

Call a dataframe's reset_index method

Source code in utilz/dfverbs/verbs.py

@curry
def reset_index(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.reset_index` method."""

    def _reset_index(df):
        reindexed = df.reset_index(*args, **kwargs)
        return reindexed

    return _reset_index

`select(*args)`

Select one or more columns by name. Drop one or more columns by prepending '-' to the name. Always returns a dataframe even if there is just 1 column. Does not support renaming

Source code in utilz/dfverbs/verbs.py

@curry
def select(*args):
    """
    Pick columns by name via `do("select", ...)`. Prefix a name with '-' to drop that
    column instead. **Always returns a dataframe**, even for a single column.
    Renaming is not supported.
    """

    def _select(df):
        selected = do("select", df, *args)
        return selected

    return _select

`sort(*args, **kwargs)`

Sort df by one or more columns passed as args. Ignores the index by default, but you can change that with ignore_index=False.

Source code in utilz/dfverbs/verbs.py

@curry
def sort(*args, **kwargs):
    """Sort the piped dataframe by the column names given as positional arguments.

    `ignore_index` defaults to True here; pass `ignore_index=False` to change that.
    Remaining keyword arguments are forwarded to `.sort_values`.
    """
    drop_old_index = kwargs.pop("ignore_index", True)

    def _sort(df):
        sort_columns = list(args)
        return df.sort_values(by=sort_columns, ignore_index=drop_old_index, **kwargs)

    return _sort

`split(*args, sep=' ')`

Split values in single df column into multiple columns by separator, e.g. First-Last -> [First], [Last]. To split list elements use [] as the sep, e.g. [1,2,3] -> [1], [2], [3]

Parameters:

Name	Type	Description	Default
`column`	`str`	column to split	required
`into`	`list`	new columns names to create	required
`sep`	`str, list`	separator to split on. Use [] for list	`' '`

Source code in utilz/dfverbs/verbs.py

@curry
def split(*args, sep=" "):
    """
    Split values in single df column into multiple columns by separator, e.g.
    First-Last -> [First], [Last]. To split list elements use [] as the sep, e.g.
    [1,2,3] -> [1], [2], [3]

    Args:
        column (str): column to split
        into (list): new columns names to create
        sep (str, list): separator to split on. Use [] for list

    Raises:
        TypeError: if `sep` is neither a str nor a list
        ValueError: if the number of names in `into` differs from the number of
        columns produced by the split
    """

    col, into = args

    def call(df):
        if isinstance(sep, str):
            out = df[col].str.split(sep, expand=True)
        elif isinstance(sep, list):
            # Reuse the input index so the new columns stay aligned with their
            # source rows when concatenated below
            out = pd.DataFrame(df[col].to_list(), index=df.index)
        else:
            # Raised here, not in the curried outer function, so curry cannot
            # mistake it for a missing-argument TypeError
            raise TypeError(
                f"sep must be a str or a list, but received {type(sep)}"
            )
        if len(into) != out.shape[1]:
            raise ValueError(
                f"into has {len(into)} elements, but splitting creates a dataframe with {out.shape[1]} columns"
            )
        else:
            out.columns = list(into)

        return pd.concat([df.drop(columns=col), out], axis=1)

    return call

`splitquery(query, **kwargs)`

Call a dataframe or groupby object's .query method and return 2 dataframes: one containing the rows where the query is true, and one containing its inverse. Resets and drops index by default. Change this with reset_index='drop'|'reset'|'none'

Source code in utilz/dfverbs/verbs.py

@curry
def splitquery(query, **kwargs):
    """
    Call a dataframe or groupby object's `.query` method and return 2 dataframes: one
    containing the rows where the query is true and one containing its inverse.
    Resets and drops index by default. Change this with `reset_index='drop'|'reset'|'none'`

    Args:
        query (str or callable): a str is passed to `df.query`; a callable is called
        with the dataframe and must return a boolean mask that supports `~`
        (e.g. a boolean Series or array)
        reset_index (str, optional): forwarded to `_reset_index_helper`

    Returns:
        tuple: `(df_yes, df_no)`

    Raises:
        TypeError: if `query` is neither a str nor callable
    """
    reset_index = kwargs.pop("reset_index", "drop")

    def call(df):
        if isinstance(query, str):
            df_yes = df.query(query, **kwargs)
            df_no = df.query(f"not ({query})", **kwargs)
        elif callable(query):
            # Evaluate the callable once and invert its result; `~` cannot be
            # applied to the function object itself
            mask = query(df)
            df_yes = df.loc[mask]
            df_no = df.loc[~mask]
        else:
            raise TypeError(
                f"splitquery expects a str or a function, but received {type(query)}"
            )

        return (
            _reset_index_helper(df_yes, reset_index),
            _reset_index_helper(df_no, reset_index),
        )

    return call

`squeeze(*args, **kwargs)`

Call a dataframe's .squeeze method

Source code in utilz/dfverbs/verbs.py

@curry
def squeeze(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.squeeze` method."""

    def _squeeze(df):
        squeezed = df.squeeze(*args, **kwargs)
        return squeezed

    return _squeeze

`summarize(dfg, **kwargs)`

Create new columns based on existing columns in a dataframe but return a smaller dataframe than the original. Works with the output of groupby as well:

Just like .mutate()/.transmute(), input should be kwargs organized like new_column = str | function. Such as: _.summarize(weight_mean='weight.mean()') or _.summarize(weight_mean=lambda weight: weight.mean()) or _.summarize(weight_mean=lambda df: df['weight'].mean()). To return output the same size as the input dataframe use .mutate() or .transmute() instead as either will broadcast values to the right size.

Source code in utilz/dfverbs/verbs.py

@curry
def summarize(dfg, **kwargs):
    """
    Create new columns based on existing columns in a dataframe but return a
    **smaller** dataframe than the original. Works with the output of `groupby` as well:

    Just like `.mutate()/.transmute()`, input should be kwargs organized like
    `new_column = str | function`. Such as: `_.summarize(weight_mean='weight.mean()')`
    or `_.summarize(weight_mean=lambda weight: weight.mean())` or
    `_.summarize(weight_mean=lambda df: df['weight'].mean())`. To return output the
    same size as the input dataframe use `.mutate()` or `.transmute()` instead as
    either will *broadcast* values to the right size.

    Args:
        dfg: a DataFrame or DataFrameGroupBy (normally the previous pipe step)
        **kwargs: `new_column=value` pairs. A str is evaluated with `.eval`. A
        function is dispatched on its positional parameter names: a single
        parameter called `df` (or `g`/`group` for grouped input) receives the
        whole frame/group, any other names are looked up as columns.

    Raises:
        TypeError: if `dfg` is not a DataFrame/GroupBy, or a value is neither a str
        nor callable
        ValueError: for grouped input when a result is not smaller than the
        original dataframe
    """

    if isinstance(dfg, pd.core.groupby.generic.DataFrameGroupBy):
        out = None
        for k, v in kwargs.items():
            if isinstance(v, str):
                res = dfg.apply(lambda group: group.eval(v)).reset_index()
            elif callable(v):
                # Only the declared positional parameters name columns; the full
                # co_varnames tuple also contains the function's local variables
                name = v.__code__.co_varnames[: v.__code__.co_argcount]
                if len(name) == 1:
                    if name[0] in ["df", "g", "group"]:
                        res = dfg.apply(v).reset_index()
                    else:
                        # Single column summarize
                        res = dfg.apply(lambda g: v(g[name[0]])).reset_index()
                else:
                    # Multi-column summarize
                    res = dfg.apply(lambda g: v(*[g[e] for e in name])).reset_index()
            else:
                raise TypeError(
                    f"summarize expects input kwargs organized like: new_colname = str | func, but receive type: {type(v)}"
                )
            # After reset_index the computed values sit in the last column
            res = res.rename(columns={res.columns[-1]: k})
            if not res.shape[0] < dfg.obj.shape[0]:
                raise ValueError(
                    "functions and expressions received by summarize should return a scalar output. If you want to broadcast this value over the entire dataframe use assign() instead."
                )
            if out is None:
                out = res
            else:
                # Later summaries are merged in on every column but the last
                out = out.drop(columns=k, errors="ignore").merge(
                    res, on=res.columns[:-1].to_list()
                )
        return out
    elif isinstance(dfg, pd.DataFrame):
        out = dict()
        for k, v in kwargs.items():
            if isinstance(v, str):
                out[k] = dfg.eval(v)
            elif callable(v):
                # Positional parameters only (see the grouped branch above)
                name = v.__code__.co_varnames[: v.__code__.co_argcount]
                if len(name) == 1:
                    if name[0] == "df":
                        out[k] = v(dfg)
                    else:
                        # Single column summarize
                        out[k] = v(dfg[name[0]])
                else:
                    # multi-col summarize
                    cols = [dfg[e] for e in name]
                    out[k] = v(*cols)
            else:
                raise TypeError(
                    f"summarized expects input kwargs organized like: new_colname = str | func, but receive type: {type(v)}"
                )

        # One-row frame holding every summary value
        return pd.DataFrame(out, index=[0])
    else:
        raise TypeError(
            f"summarize expected previous step to be a DataFrame or GroupBy, but received a {type(dfg)}. If you used select(), you should instead select the column in the expression or function passed to summarize(new_col='old_col.mean()'). If you intended to run an expression summarize takes kwargs organized like: new_colname = str | func. This differs from agg in pandas which expects a column name and expression!"
        )

`tail(*args, **kwargs)`

Call dataframe's .tail() method

Source code in utilz/dfverbs/verbs.py

@curry
def tail(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.tail()` method."""

    def _tail(df):
        bottom = df.tail(*args, **kwargs)
        return bottom

    return _tail

`to_csv(path, df, index=False)`

Call a dataframe's .to_csv(index=False) method

Source code in utilz/dfverbs/verbs.py

@curry
def to_csv(path, df, index=False):
    """Write `df` to `path` with `df.to_csv` (`index=False` unless overridden) and
    return `df` unchanged so the pipeline can continue.

    '.csv' is appended to `path` when its string form does not already end with it.
    """
    has_suffix = str(path).endswith(".csv")
    target = path if has_suffix else f"{path}.csv"
    df.to_csv(f"{target}", index=index)
    return df

`to_list(*args, **kwargs)`

Call a dataframe's .to_list method

Source code in utilz/dfverbs/verbs.py

@curry
def to_list(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.to_list` method."""

    def _to_list(df):
        items = df.to_list(*args, **kwargs)
        return items

    return _to_list

`to_numpy(*args, **kwargs)`

Call a dataframe's .to_numpy method

Source code in utilz/dfverbs/verbs.py

@curry
def to_numpy(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.to_numpy` method."""

    def _to_numpy(df):
        array = df.to_numpy(*args, **kwargs)
        return array

    return _to_numpy

`transmute(dfg, **kwargs)`

Just like .mutate(), but only returns the newly created columns.

Source code in utilz/dfverbs/verbs.py

@curry
def transmute(dfg, **kwargs):
    """Run `mutate` and keep only the columns it newly created.

    Raises `ValueError` when nothing is left after removing the original columns,
    i.e. when every kwarg reassigned an existing column.
    """
    grouped_types = (
        pd.core.groupby.generic.DataFrameGroupBy,
        pd.core.groupby.generic.SeriesGroupBy,
    )
    # For grouped input the original columns live on the underlying `.obj`
    orig = dfg.obj if isinstance(dfg, grouped_types) else dfg

    mutated = mutate(dfg, **kwargs)
    existing = filter(list(orig.columns), list(mutated.columns), substr_match=False)
    mutated = mutated.drop(columns=existing)

    if mutated.shape[1] >= 1:
        return mutated
    raise ValueError(
        "transmute does not support reassigning to an existing column. Give your new column(s) a different name(s) to extract"
    )

Stats

dataframe stats methods

`abs(*args, **kwargs)`

Call df.abs

Source code in utilz/dfverbs/stats.py

@curry
def abs(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.abs` method."""

    def _abs(df):
        result = df.abs(*args, **kwargs)
        return result

    return _abs

`all(*args, **kwargs)`

Call df.all

Source code in utilz/dfverbs/stats.py

@curry
def all(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.all` method."""

    def _all(df):
        result = df.all(*args, **kwargs)
        return result

    return _all

`any(*args, **kwargs)`

Call df.any

Source code in utilz/dfverbs/stats.py

@curry
def any(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.any` method."""

    def _any(df):
        result = df.any(*args, **kwargs)
        return result

    return _any

`bootci(col, **kwargs)`

Calculate 95% bootstrapped confidence intervals on the mean of a column. Unlike summarize, bootci expects a string column name and will return a summary frame with columns for the mean, 2.5% and 97.5% confidence limits. Use as_deviation=True to convert the CIs to deviations from the mean. Accepts all the same args as seaborn.algorithms.bootstrap, e.g. units.

Source code in utilz/dfverbs/stats.py

@curry
def bootci(col, **kwargs):
    """Calculate 95% bootstrapped confidence intervals on the mean of a column. Unlike
    summarize, bootci expects a string column name and will return a summary frame with
    columns for the mean, 2.5% and 97.5% confidence limits. Use `as_deviation=True` to
    convert the CIs to deviations from the mean. Accepts all the same args as
    `seaborn.algorithms.bootstrap`, e.g. `units`.

    Args:
        col (str): name of the column to bootstrap
        as_deviation (bool, optional): report the limits as distances from the mean.
        Defaults to False
        units (str, optional): name of a column whose per-group values are passed to
        the bootstrap as `units`
        **kwargs: forwarded to `sns.algorithms.bootstrap`

    Raises:
        TypeError: if the piped object is not a `DataFrameGroupBy`
    """

    deviation = kwargs.pop("as_deviation", False)
    # Consume `units` once, up front. Popping it inside `call` would mutate the
    # shared kwargs, so a second call of the same step would lose it.
    units = kwargs.pop("units", None)

    def call(df):
        if isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
            # Per-group (lower, upper) limits, split into two named columns
            cis = pipe(
                df,
                apply(
                    lambda g: sns.utils.ci(
                        sns.algorithms.bootstrap(
                            g[col],
                            units=g[units] if units is not None else None,
                            **kwargs,
                        )
                    ),
                    reset_index="reset",
                ),
                split(0, [f"{col}_ci_l", f"{col}_ci_u"], sep=[]),
            )
            summary = pipe(df, summarize(**{f"{col}_mean": f"{col}.mean()"}))
            # Merge the means onto the limits using the columns both frames share
            matching_cols = filter(summary.columns, cis.columns)
            cis = pipe(cis, merge(summary, on=matching_cols))
            if deviation:
                cis = pipe(
                    cis,
                    mutate(
                        **{
                            f"{col}_ci_l": f"{col}_mean - {col}_ci_l",
                            f"{col}_ci_u": f"{col}_ci_u - {col}_mean",
                        },
                    ),
                )

            return cis
        else:
            raise TypeError(
                "bootci only works on grouped dataframes, trying call _.groupby before"
            )

    return call

`corr(*args, **kwargs)`

Call df.corr

Source code in utilz/dfverbs/stats.py

@curry
def corr(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.corr` method."""

    def _corr(df):
        result = df.corr(*args, **kwargs)
        return result

    return _corr

`count(*args, **kwargs)`

Call df.count

Source code in utilz/dfverbs/stats.py

@curry
def count(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.count` method."""

    def _count(df):
        result = df.count(*args, **kwargs)
        return result

    return _count

`cov(*args, **kwargs)`

Call df.cov

Source code in utilz/dfverbs/stats.py

@curry
def cov(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.cov` method."""

    def _cov(df):
        result = df.cov(*args, **kwargs)
        return result

    return _cov

`max(*args, **kwargs)`

Call df.max

Source code in utilz/dfverbs/stats.py

@curry
def max(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.max` method."""

    def _max(df):
        result = df.max(*args, **kwargs)
        return result

    return _max

`mean(*args, **kwargs)`

Call df.mean

Source code in utilz/dfverbs/stats.py

@curry
def mean(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.mean` method."""

    def _mean(df):
        result = df.mean(*args, **kwargs)
        return result

    return _mean

`median(*args, **kwargs)`

Call df.median

Source code in utilz/dfverbs/stats.py

@curry
def median(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.median` method."""

    def _median(df):
        result = df.median(*args, **kwargs)
        return result

    return _median

`min(*args, **kwargs)`

Call df.min

Source code in utilz/dfverbs/stats.py

@curry
def min(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.min` method."""

    def _min(df):
        result = df.min(*args, **kwargs)
        return result

    return _min

`mode(*args, **kwargs)`

Call df.mode

Source code in utilz/dfverbs/stats.py

@curry
def mode(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.mode` method."""

    def _mode(df):
        result = df.mode(*args, **kwargs)
        return result

    return _mode

`nunique(*args, **kwargs)`

Call df.nunique

Source code in utilz/dfverbs/stats.py

@curry
def nunique(*args, **kwargs):
    """Forward the given arguments to the piped object's `.nunique` method, then
    reset the index and rename the 'index' and 0 columns to 'column' and
    'nunique'."""

    def _nunique(df):
        counts = df.nunique(*args, **kwargs)
        tidy = counts.reset_index()
        return tidy.rename(columns={"index": "column", 0: "nunique"})

    return _nunique

`prod(*args, **kwargs)`

Call df.prod

Source code in utilz/dfverbs/stats.py

@curry
def prod(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.prod` method."""

    def _prod(df):
        result = df.prod(*args, **kwargs)
        return result

    return _prod

`rank(*args, **kwargs)`

Call df.rank

Source code in utilz/dfverbs/stats.py

@curry
def rank(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.rank` method."""

    def _rank(df):
        result = df.rank(*args, **kwargs)
        return result

    return _rank

`round(*args, **kwargs)`

Call df.round

Source code in utilz/dfverbs/stats.py

@curry
def round(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.round` method."""

    def _round(df):
        result = df.round(*args, **kwargs)
        return result

    return _round

`sem(*args, **kwargs)`

Call df.sem

Source code in utilz/dfverbs/stats.py

@curry
def sem(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.sem` method."""

    def _sem(df):
        result = df.sem(*args, **kwargs)
        return result

    return _sem

`size(*args, **kwargs)`

Call df.size

Source code in utilz/dfverbs/stats.py

@curry
def size(*args, **kwargs):
    """Return the piped object's `.size`.

    On a plain DataFrame or Series `.size` is an integer property, while on a groupby
    object it is a method. The attribute is therefore only called (with the given
    arguments) when it is callable; otherwise its value is returned as is.
    """

    def call(df):
        attr = df.size
        if callable(attr):
            return attr(*args, **kwargs)
        # Property case: calling it would raise "'int' object is not callable"
        return attr

    return call

`sqrt(*args, **kwargs)`

Call df.sqrt

Source code in utilz/dfverbs/stats.py

@curry
def sqrt(*args, **kwargs):
    """Take the element-wise square root of the piped object.

    pandas dataframes do not ship a `.sqrt` method, so a callable `sqrt` attribute on
    the piped object is used when one exists and `np.sqrt` is used otherwise. The
    given arguments are forwarded in either case.
    """

    def call(df):
        method = getattr(df, "sqrt", None)
        if callable(method):
            return method(*args, **kwargs)
        # Also reached when `sqrt` resolves to a column of that name
        return np.sqrt(df, *args, **kwargs)

    return call

`std(*args, **kwargs)`

Call df.std

Source code in utilz/dfverbs/stats.py

@curry
def std(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.std` method."""

    def _std(df):
        result = df.std(*args, **kwargs)
        return result

    return _std

`sum(*args, **kwargs)`

Call df.sum

Source code in utilz/dfverbs/stats.py

@curry
def sum(*args, **kwargs):
    """Build a pipeline step that forwards the given arguments to the piped object's
    `.sum` method."""

    def _sum(df):
        result = df.sum(*args, **kwargs)
        return result

    return _sum

`unique(*args, **kwargs)`

Call df.unique

Source code in utilz/dfverbs/stats.py

@curry
def unique(*args, **kwargs):
    """Collect unique values per column of the piped dataframe.

    A single-column frame is squeezed and its `.unique` is called directly; otherwise
    `.unique` is applied to each column. A numpy-array result is returned as a frame
    with 'column' and 'unique' columns; any other result has its index reset and
    its 'index'/0 columns renamed to 'column'/'unique'.
    """

    def _unique(df):
        # `.unique` only exists on series, hence the squeeze / per-column apply
        single_column = df.shape[1] == 1
        if single_column:
            values = df.squeeze().unique(*args, **kwargs)
        else:
            values = df.apply(lambda series: series.unique(*args, **kwargs))

        if not isinstance(values, np.ndarray):
            tidy = values.reset_index()
            return tidy.rename(columns={"index": "column", 0: "unique"})

        # just one col
        table = pd.DataFrame(values, columns=["unique"])
        table["column"] = df.columns[0]
        return table[["column", "unique"]]

    return _unique

`value_counts(*args, **kwargs)`

Call df.value_counts

Source code in utilz/dfverbs/stats.py

@curry
def value_counts(*args, **kwargs):
    """Tabulate value frequencies with the piped object's ``.value_counts``.

    Arguments are forwarded to ``.value_counts``. The result's index is reset
    and any ``"index"`` / ``0`` columns are renamed to ``"column"`` /
    ``"count"``.
    """

    def _tabulate(df):
        counts = df.value_counts(*args, **kwargs)
        tidy = counts.reset_index()
        return tidy.rename(columns={"index": "column", 0: "count"})

    return _tabulate

`var(*args, **kwargs)`

Call df.var

Source code in utilz/dfverbs/stats.py

@curry
def var(*args, **kwargs):
    """Forward to the piped object's ``.var`` method with the given arguments."""

    def _variance(df):
        method = df.var
        return method(*args, **kwargs)

    return _variance

Plots

plotting verbs to wrap calls to seaborn

`barplot(**kwargs)`

Call to seaborn barplot

Source code in utilz/dfverbs/plot.py

@curry
def barplot(**kwargs):
    """Call to seaborn barplot

    Keyword arguments are forwarded to ``sns.barplot`` and the piped object is
    passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.barplot(data=data, **kwargs)
        return sns.barplot(data=data, ax=newax(), **kwargs)

    return plot

`boxenplot(**kwargs)`

Call to seaborn boxenplot

Source code in utilz/dfverbs/plot.py

@curry
def boxenplot(**kwargs):
    """Call to seaborn boxenplot

    Keyword arguments are forwarded to ``sns.boxenplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.boxenplot(data=data, **kwargs)
        return sns.boxenplot(data=data, ax=newax(), **kwargs)

    return plot

`boxplot(**kwargs)`

Call to seaborn boxplot

Source code in utilz/dfverbs/plot.py

@curry
def boxplot(**kwargs):
    """Call to seaborn boxplot

    Keyword arguments are forwarded to ``sns.boxplot`` and the piped object is
    passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.boxplot(data=data, **kwargs)
        return sns.boxplot(data=data, ax=newax(), **kwargs)

    return plot

`catplot(**kwargs)`

Call to seaborn catplot

Source code in utilz/dfverbs/plot.py

@curry
def catplot(**kwargs):
    """Draw a seaborn ``catplot`` from the piped data.

    Keyword arguments are forwarded to ``sns.catplot`` unchanged; the piped
    object is passed as ``data``.
    """

    def _draw(data):
        figure = sns.catplot(data=data, **kwargs)
        return figure

    return _draw

`clustermap(**kwargs)`

Call to seaborn clustermap

Source code in utilz/dfverbs/plot.py

@curry
def clustermap(**kwargs):
    """Draw a seaborn ``clustermap`` from the piped data.

    Keyword arguments are forwarded to ``sns.clustermap`` unchanged; the piped
    object is passed as ``data``.
    """

    def _draw(data):
        figure = sns.clustermap(data=data, **kwargs)
        return figure

    return _draw

`countplot(**kwargs)`

Call to seaborn countplot

Source code in utilz/dfverbs/plot.py

@curry
def countplot(**kwargs):
    """Call to seaborn countplot

    Keyword arguments are forwarded to ``sns.countplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.countplot(data=data, **kwargs)
        return sns.countplot(data=data, ax=newax(), **kwargs)

    return plot

`displot(**kwargs)`

Call to seaborn displot

Source code in utilz/dfverbs/plot.py

@curry
def displot(**kwargs):
    """Draw a seaborn ``displot`` from the piped data.

    Keyword arguments are forwarded to ``sns.displot`` unchanged; the piped
    object is passed as ``data``.
    """

    def _draw(data):
        figure = sns.displot(data=data, **kwargs)
        return figure

    return _draw

`ecdfplot(**kwargs)`

Call to seaborn ecdfplot

Source code in utilz/dfverbs/plot.py

@curry
def ecdfplot(**kwargs):
    """Call to seaborn ecdfplot

    Keyword arguments are forwarded to ``sns.ecdfplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.ecdfplot(data=data, **kwargs)
        return sns.ecdfplot(data=data, ax=newax(), **kwargs)

    return plot

`heatmap(**kwargs)`

Call to seaborn heatmap

Source code in utilz/dfverbs/plot.py

@curry
def heatmap(**kwargs):
    """Call to seaborn heatmap

    Keyword arguments are forwarded to ``sns.heatmap`` and the piped object is
    passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.heatmap(data=data, **kwargs)
        return sns.heatmap(data=data, ax=newax(), **kwargs)

    return plot

`histplot(**kwargs)`

Call to seaborn histplot

Source code in utilz/dfverbs/plot.py

@curry
def histplot(**kwargs):
    """Call to seaborn histplot

    Keyword arguments are forwarded to ``sns.histplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.histplot(data=data, **kwargs)
        return sns.histplot(data=data, ax=newax(), **kwargs)

    return plot

`jointplot(**kwargs)`

Call to seaborn jointplot

Source code in utilz/dfverbs/plot.py

@curry
def jointplot(**kwargs):
    """Call to seaborn jointplot

    Keyword arguments are forwarded to ``sns.jointplot`` and the piped object
    is passed as ``data``. ``jointplot`` is a figure-level function that
    builds its own figure, so no axes is created here. This matches the other
    figure-level wrappers in this module (``catplot``, ``displot``,
    ``lmplot``, ``pairplot``, ``relplot``).
    """

    def plot(data):
        # Do not pass ax=newax(): it only leaves behind a stray empty axes
        # that jointplot does not draw on
        return sns.jointplot(data=data, **kwargs)

    return plot

`kdeplot(**kwargs)`

Call to seaborn kdeplot

Source code in utilz/dfverbs/plot.py

@curry
def kdeplot(**kwargs):
    """Call to seaborn kdeplot

    Keyword arguments are forwarded to ``sns.kdeplot`` and the piped object is
    passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.kdeplot(data=data, **kwargs)
        return sns.kdeplot(data=data, ax=newax(), **kwargs)

    return plot

`lineplot(**kwargs)`

Call to seaborn lineplot

Source code in utilz/dfverbs/plot.py

@curry
def lineplot(**kwargs):
    """Call to seaborn lineplot

    Keyword arguments are forwarded to ``sns.lineplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.lineplot(data=data, **kwargs)
        return sns.lineplot(data=data, ax=newax(), **kwargs)

    return plot

`lmplot(**kwargs)`

Call to seaborn lmplot

Source code in utilz/dfverbs/plot.py

@curry
def lmplot(**kwargs):
    """Draw a seaborn ``lmplot`` from the piped data.

    Keyword arguments are forwarded to ``sns.lmplot`` unchanged; the piped
    object is passed as ``data``.
    """

    def _draw(data):
        figure = sns.lmplot(data=data, **kwargs)
        return figure

    return _draw

`pairplot(**kwargs)`

Call to seaborn pairplot

Source code in utilz/dfverbs/plot.py

@curry
def pairplot(**kwargs):
    """Draw a seaborn ``pairplot`` from the piped data.

    Keyword arguments are forwarded to ``sns.pairplot`` unchanged; the piped
    object is passed as ``data``.
    """

    def _draw(data):
        figure = sns.pairplot(data=data, **kwargs)
        return figure

    return _draw

`plot(*args, **kwargs)`

Call a dataframe's .plot method

Source code in utilz/dfverbs/plot.py

@curry
def plot(*args, **kwargs):
    """Forward to the piped object's ``.plot`` with the given arguments."""

    def _draw(df):
        plotter = df.plot
        return plotter(*args, **kwargs)

    return _draw

`pointplot(**kwargs)`

Call to seaborn pointplot

Source code in utilz/dfverbs/plot.py

@curry
def pointplot(**kwargs):
    """Call to seaborn pointplot

    Keyword arguments are forwarded to ``sns.pointplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.pointplot(data=data, **kwargs)
        return sns.pointplot(data=data, ax=newax(), **kwargs)

    return plot

`regplot(**kwargs)`

Call to seaborn regplot

Source code in utilz/dfverbs/plot.py

@curry
def regplot(**kwargs):
    """Call to seaborn regplot

    Keyword arguments are forwarded to ``sns.regplot`` and the piped object is
    passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.regplot(data=data, **kwargs)
        return sns.regplot(data=data, ax=newax(), **kwargs)

    return plot

`relplot(**kwargs)`

Call to seaborn relplot

Source code in utilz/dfverbs/plot.py

@curry
def relplot(**kwargs):
    """Draw a seaborn ``relplot`` from the piped data.

    Keyword arguments are forwarded to ``sns.relplot`` unchanged; the piped
    object is passed as ``data``.
    """

    def _draw(data):
        figure = sns.relplot(data=data, **kwargs)
        return figure

    return _draw

`residplot(**kwargs)`

Call to seaborn residplot

Source code in utilz/dfverbs/plot.py

@curry
def residplot(**kwargs):
    """Call to seaborn residplot

    Keyword arguments are forwarded to ``sns.residplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.residplot(data=data, **kwargs)
        return sns.residplot(data=data, ax=newax(), **kwargs)

    return plot

`rugplot(**kwargs)`

Call to seaborn rugplot

Source code in utilz/dfverbs/plot.py

@curry
def rugplot(**kwargs):
    """Call to seaborn rugplot

    Keyword arguments are forwarded to ``sns.rugplot`` and the piped object is
    passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.rugplot(data=data, **kwargs)
        return sns.rugplot(data=data, ax=newax(), **kwargs)

    return plot

`scatterplot(**kwargs)`

Call to seaborn scatterplot

Source code in utilz/dfverbs/plot.py

@curry
def scatterplot(**kwargs):
    """Call to seaborn scatterplot

    Keyword arguments are forwarded to ``sns.scatterplot`` and the piped
    object is passed as ``data``. The plot is drawn on a new axes from
    ``newax()`` unless the caller supplies ``ax``, which is then used instead
    (it used to clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.scatterplot(data=data, **kwargs)
        return sns.scatterplot(data=data, ax=newax(), **kwargs)

    return plot

`stripbarplot(**kwargs)`

Call to combined stripplot and barplot. See utilz.plot.stripbarplot

Source code in utilz/dfverbs/plot.py

@curry
def stripbarplot(**kwargs):
    """Call to combined stripplot and barplot. See utilz.plot.stripbarplot

    Keyword arguments are forwarded to ``_stripbarplot`` and the piped object
    is passed as ``data``. ``ax`` defaults to the string ``"newax"`` when the
    caller does not supply one.
    """

    def plot(data):
        # Read "ax" without popping it: mutating the captured kwargs would
        # make a second call of this function lose the caller's ax
        ax = kwargs.get("ax", "newax")
        rest = {key: val for key, val in kwargs.items() if key != "ax"}
        return _stripbarplot(data=data, ax=ax, **rest)

    return plot

`stripplot(**kwargs)`

Call to seaborn stripplot

Source code in utilz/dfverbs/plot.py

@curry
def stripplot(**kwargs):
    """Call to seaborn stripplot

    Keyword arguments are forwarded to ``sns.stripplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.stripplot(data=data, **kwargs)
        return sns.stripplot(data=data, ax=newax(), **kwargs)

    return plot

`swarmplot(**kwargs)`

Call to seaborn swarmplot

Source code in utilz/dfverbs/plot.py

@curry
def swarmplot(**kwargs):
    """Call to seaborn swarmplot

    Keyword arguments are forwarded to ``sns.swarmplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.swarmplot(data=data, **kwargs)
        return sns.swarmplot(data=data, ax=newax(), **kwargs)

    return plot

`violinplot(**kwargs)`

Call to seaborn violinplot

Source code in utilz/dfverbs/plot.py

@curry
def violinplot(**kwargs):
    """Call to seaborn violinplot

    Keyword arguments are forwarded to ``sns.violinplot`` and the piped object
    is passed as ``data``. The plot is drawn on a new axes from ``newax()``
    unless the caller supplies ``ax``, which is then used instead (it used to
    clash with the hard-coded ``ax`` and raise ``TypeError``).
    """

    def plot(data):
        # Only create a new axes when the caller did not bring their own
        if "ax" in kwargs:
            return sns.violinplot(data=data, **kwargs)
        return sns.violinplot(data=data, ax=newax(), **kwargs)

    return plot

`utilz.dftools`

Common data operations and transformations often on pandas dataframes. This creates new dataframe methods that can be called like this:

df.norm_by_group(grpcol='Class', valcols='Score')

`assert_balanced_groups(df, grpcols, size=None)`

Check if each group of grpcols has the same dimensions

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	input dataframe	required
`grpcols`	`str/List`	column names to group on in dataframe	required
`size`	`int/None`	optional group size to ensure; when omitted, every group is compared against the first group	`None`

Source code in utilz/dftools.py

@_register_dataframe_method
def assert_balanced_groups(df, grpcols: Union[str, List], size=None):
    """
    Check if each group of `grpcols` has the same number of rows

    Args:
        df (pd.DataFrame): input dataframe
        grpcols (str/List): column name(s) to group on in dataframe
        size (int/None, optional): number of rows every group must have.
        Defaults to None, in which case every group is compared against the
        first group.

    Returns:
        bool: True when all groups have the expected size (also when there
        are no groups and `size` is None)

    Raises:
        AssertionError: if any group's size differs from the expected size
    """

    grouped = df.groupby(grpcols).size()
    if size is None:
        # No groups means nothing can be unbalanced
        if grouped.empty:
            return True
        # Take the first group by position: grouped[0] is a label lookup and
        # fails when the group keys are integers that do not include 0
        size = grouped.iloc[0]
    if not np.all(grouped == size):
        raise AssertionError(f"Group sizes don't match!\n{grouped}")
    return True

`assert_same_nunique(df, grpcols, valcol, size=None)`

Check if each group has the same number of unique values in valcol

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	input dataframe	required
`valcol`	`str`	column to check unique values in	required
`grpcols`	`str/list`	column names to group on in dataframe	required
`size`	`int/None`	optional number of unique values to ensure; when omitted, every group is compared against the first group	`None`

Source code in utilz/dftools.py

@_register_dataframe_method
def assert_same_nunique(df, grpcols: Union[str, List], valcol: str, size=None):
    """
    Check if each group has the same number of unique values in `valcol`

    Args:
        df (pd.DataFrame): input dataframe
        grpcols (str/list): column name(s) to group on in dataframe
        valcol (str): column to check unique values in
        size (int/None, optional): number of unique values every group must
        have. Defaults to None, in which case every group is compared against
        the first group.

    Returns:
        bool: True when all groups have the expected number of unique values
        (also when there are no groups and `size` is None)

    Raises:
        AssertionError: if any group's count differs from the expected count
    """

    grouped = df.groupby(grpcols)[valcol].nunique()
    if size is None:
        # No groups means nothing can differ
        if grouped.empty:
            return True
        # Take the first group by position: grouped[0] is a label lookup and
        # fails when the group keys are integers that do not include 0
        size = grouped.iloc[0]
    if not np.all(grouped == size):
        raise AssertionError(f"Groups don't have same nunique values!\n{grouped}")
    return True

`norm_by_group(df, grpcol, valcols, center=True, scale=True, addcol=True)`

Normalize values in one or more columns separately per group

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	input dataframe	required
`grpcol`	`str`	grouping col	required
`valcols`	`Union[str, List]`	value cols	required
`center`	`bool`	mean center. Defaults to True.	`True`
`scale`	`bool`	divide by standard deviation. Defaults to True.	`True`

Source code in utilz/dftools.py

@_register_dataframe_method
def norm_by_group(df, grpcol, valcols, center=True, scale=True, addcol=True):
    """
    Normalize values in one or more columns separately per group

    Args:
        df (pd.DataFrame): input dataframe
        grpcol (str): grouping col
        valcols (Union[str, List]): value cols
        center (bool, optional): mean center. Defaults to True.
        scale (bool, optional): divide by standard deviation. Defaults to True.
        addcol (bool, optional): if True, return `df` with one new column per
        value col named `{valcol}_{normed|centered|scaled}_by_{grpcol}`; if
        False, return only the transformed values. Defaults to True.

    Returns:
        The result after a final `.squeeze()`.

    Raises:
        NotImplementedError: if `grpcol` is a list
        ValueError: if `addcol` is True but both `center` and `scale` are False
    """

    def _norm(dat, center, scale):
        if center:
            dat = dat - dat.mean()
        if scale:
            dat = dat / dat.std()
        return dat

    if isinstance(grpcol, list):
        raise NotImplementedError("Grouping by multiple columns is not supported")

    if not isinstance(valcols, list):
        valcols = [valcols]

    if addcol:
        # Choose the suffix up front so a no-op request fails with a clear
        # message instead of an unbound local variable
        if center and scale:
            suffix = "normed"
        elif center:
            suffix = "centered"
        elif scale:
            suffix = "scaled"
        else:
            raise ValueError(
                "addcol=True requires at least one of center or scale to be True"
            )

    out = df.groupby(grpcol)[valcols].transform(_norm, center, scale)

    if addcol:
        # Assign the transformed Series directly; a dict round trip would
        # collapse rows that share an index label
        new_cols = {f"{col}_{suffix}_by_{grpcol}": out[col] for col in out.columns}
        out = df.assign(**new_cols)
    return out.squeeze()

`pivot_longer(df, columns=None, id_vars=None, into=('variable', 'value'), make_index=False)`

Take multiple columns or multiple id_vars and melt them into 2 columns. If columns is provided, id_vars is inferred and vice versa. If make_index=True, will use the current index as a new id_var to ensure a unique index.

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	input DataFrame	required
`columns`	`list or None`	columns to melt; Defaults to None	`None`
`id_vars`	`list or None`	columns to use as id variables; Default to None	`None`
`into`	`tuple`	cols to create Defaults to ("variable", "value").	`('variable', 'value')`
`make_index`	`bool`	does a reset_index prior to melting and adds the index col to id_vars. Defaults to False.	`False`

Source code in utilz/dftools.py

@_register_dataframe_method
def pivot_longer(
    df, columns=None, id_vars=None, into=("variable", "value"), make_index=False
):
    """
    Take multiple columns or multiple id_vars and melt them into 2 columns. If columns
    is provided, id_vars is inferred and vice versa. If make_index=True, will use the
    current index as a new id_var to ensure a unique index.

    Args:
        df (pd.DataFrame): input DataFrame
        columns (str, list or None): column(s) to melt; Defaults to None
        id_vars (str, list or None): column(s) to use as id variables; Defaults to None
        into (tuple, optional): names of the two columns to create. Defaults to
        ("variable", "value").
        make_index (bool, optional): does a reset_index prior to melting and adds the
        index col (as "prev_index") to id_vars. Defaults to False.

    Returns:
        pd.DataFrame: the melted dataframe
    """
    # A bare string is a single column name. Wrap it so the membership tests
    # and list concatenation below work on names rather than characters
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(id_vars, str):
        id_vars = [id_vars]

    # User provide list of columns to gather -> like in R
    if columns is not None:
        # Grab remaining columns if id_vars isn't provided
        if id_vars is None:
            id_vars = [col for col in df.columns if col not in columns]
    elif id_vars is not None:
        columns = [col for col in df.columns if col not in id_vars]

    if make_index:
        df = df.reset_index().rename(columns={"index": "prev_index"})
        if id_vars is None:
            id_vars = ["prev_index"]
        else:
            id_vars = list(id_vars) + ["prev_index"]

    return df.melt(
        id_vars=id_vars,
        value_vars=columns,
        var_name=into[0],
        value_name=into[1],
    )

`pivot_wider(df, column, using, drop_index=True)`

Cast a column of long-form tidy data to a set of wide columns based on the values in another column ('using')

Parameters:

Name	Type	Description	Default
`df`	`pd.DataFrame`	input dataframe	required
`column`	`str`	string name of column to "explode"	required
`using`	`str`	string name of column whose values should be placed into the new columns	required
`drop_index`	`bool; optional`	if a 'prev_index' col exists (usually created by make_index=True in pivot_longer) will drop it; Default True	`True`

Source code in utilz/dftools.py

@_register_dataframe_method
def pivot_wider(df, column, using, drop_index=True):
    """
    Spread long-form data into wide columns: one new column per distinct value of
    `column`, filled with the matching values from `using`. All remaining columns
    are used as the pivot index.

    Args:
        df (pd.DataFrame): input dataframe
        column (str): name of the column whose values become the new column names
        using (str): name of the column whose values fill the new columns
        drop_index (bool; optional): drop a 'prev_index' col from the result if one
        exists (usually created by make_index=True in pivot_longer); Default True

    Raises:
        ValueError: re-raised from the pivot; when the message mentions
        "duplicate", an explanatory hint is printed first
    """
    id_cols = [name for name in df.columns if name not in (column, using)]
    try:
        wide = df.pivot(index=id_cols, columns=column, values=using)
        wide = wide.reset_index()
        if not drop_index:
            return wide
        return wide.drop(columns=["prev_index"], errors="ignore")
    except ValueError as err:
        if "duplicate" in str(err):
            print(
                f"ERROR: It's not possible to infer what rows are unique from columns that make up the index: {id_cols}. If you have multiple observations per index, then you should use .pivot_table and decide how to *aggregate* these observations. Otherwise .pivot_longer() can create a unique index for with make_index = True"
            )
        raise err

`select(df, *args, **kwargs)`

Select one or more columns by name. Drop one or more columns by prepending '-' to the name. Rename columns using keyword arguments.

Examples:

>>> # Grab 2 columns
>>> df.select('sepal_width', 'petal_width')

>>> # Get all columns except one
>>> df.select('-sepal_width')

>>> # Grab a column and rename it
>>> df.select(sepal_width='width')

Source code in utilz/dftools.py

@_register_dataframe_method
def select(df, *args, **kwargs):
    """
    Pick columns by name, drop columns by prefixing their name with '-', or pick and
    rename columns in one step with keyword arguments (``old_name='new_name'``).
    Positional and keyword arguments cannot be combined in one call.

    Examples:

        >>> # Grab 2 columns
        >>> df.select('sepal_width', 'petal_width')


        >>> # Get all columns except one
        >>> df.select('-sepal_width')

        >>> # Grab a column and rename it
        >>> df.select(sepal_width='width')

    Raises:
        ValueError: if both positional and keyword arguments are given
    """
    # Keyword form: keep the named columns, then rename them
    if kwargs:
        if args:
            raise ValueError(
                "mixing arguments and keyword arguments is not supported. If you want to filter columns and rename them, you should instead chain multiple calls to .select. For example: df.select('-sepal_length').select(petal_width='width', species='flower')"
            )
        wanted = list(kwargs.keys())
        picked = df.filter(items=wanted, axis="columns")
        return picked.rename(columns=kwargs)

    # Positional form: partition the names on the '-' marker
    requested = list(args)
    drop, keep = filter("-", requested, invert="split", assert_notempty=False)
    if len(drop):
        # Strip the leading '-' to recover the real column names
        drop = mapcat(lambda name: name[1:], drop)
    remaining = df.drop(columns=drop)
    if len(keep):
        return remaining.filter(items=keep, axis="columns")
    return remaining