Skip to content

Dataframe verbs and tools

The dfverbs module is intended to be imported as an alias and used inside pipe for dplyr like data manipulation grammar. Using the sample on the redframes README:

import pandas as pd
import numpy as np
from utilz import pipe, randdf
import utilz.dfverbs as _

# Define demo df
df = pd.DataFrame({
    'bear': ['Brown bear', 'Polar bear', 'Asian black bear', 'American black bear', 'Sun bear', 'Sloth bear', 'Spectacled bear', 'Giant panda'],
    'genus': ['Ursus', 'Ursus', 'Ursus', 'Ursus', 'Helarctos', 'Melursus', 'Tremarctos', 'Ailuropoda'],
    'weight (male, lbs)': ['300-860', '880-1320', '220-440', '125-500', '60-150', '175-310', '220-340', '190-275'],
    'weight (female, lbs)': ['205-455', '330-550', '110-275', '90-300', '45-90', '120-210', '140-180', '155-220']
})

out = pipe(
    df,
    _.rename({"weight (male, lbs)": "male", "weight (female, lbs)": "female"}),
    _.pivot_longer(columns=["male", "female"], into=("sex", "weight")),
    _.split("weight", ("min", "max"), sep="-"),
    _.pivot_longer(columns=["min", "max"], into=("stat", "weight")),
    _.astype({"weight": float}),
    _.groupby("genus", "sex"),
    _.summarize(weight="weight.mean()"),
    _.pivot_wider(column="sex", using="weight"),
    _.mutate(dimorphism="male / female"),  # no rounding possible
    _.mutate(dimorphism=lambda male, female: np.round(male / female, 2)) # instead use a func
)

Note

The dftools module, on the other hand, is not intended to be imported at all. Instead it defines new .methods on pandas DataFrame and DataFrameGroupBy objects automatically, e.g. df.select('-Col1') is a new method that allows for R-style column selection.

Verbs

dplyr like verbs for working with pandas dataframes.

apply(*args, **kwargs)

Call a dataframe or groupby object's .apply method. For grouped dataframes, resets and drops index by default. Change this with reset_index='drop'|'reset'|'none'

Source code in utilz/dfverbs/verbs.py
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
@curry
def apply(*args, **kwargs):
    """Invoke `.apply` on a dataframe or groupby object.
    Grouped inputs have their index reset and dropped by default; override via
    `reset_index='drop'|'reset'|'none'`
    """

    how = kwargs.pop("reset_index", "drop")

    def call(df):
        result = df.apply(*args, **kwargs)
        if isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
            result = _reset_index_helper(result, how)
        return result

    return call

assign(**kwargs)

Call a dataframe object's .assign method

Source code in utilz/dfverbs/verbs.py
213
214
215
216
217
218
219
220
221
@curry
def assign(**kwargs):
    """Forward keyword arguments to a dataframe's `.assign` method."""
    return lambda df: df.assign(**kwargs)

astype(cols, df)

Cast one or more columns to a type. Like .rename() you can either input a single tuple to cast 1 column or a dict to cast multiple

Source code in utilz/dfverbs/verbs.py
512
513
514
515
516
517
518
@curry
def astype(cols, df):
    """Cast one or more columns to a type. As with `.rename()`, pass a single
    (column, type) tuple to cast one column or a dict to cast several."""
    mapping = dict([cols]) if isinstance(cols, tuple) else cols
    return df.astype(mapping)

call(*args, **kwargs)

Call an arbitrary method or function on an object, e.g. pipe(df, _.call('mean')) would call df.mean()

Source code in utilz/dfverbs/verbs.py
533
534
535
536
537
538
539
540
541
542
543
544
545
546
@curry
def call(*args, **kwargs):
    """Invoke an arbitrary method on the piped object by name, e.g.
    `pipe(df, _.call('mean'))` runs `df.mean()`"""

    def _call(df):
        method_name = args[0]
        func = getattr(df, method_name, None)
        if func is None:
            raise AttributeError(f"{type(df)} does not have a {method_name} method")
        return func(*args[1:], **kwargs)

    return _call

concat(*args, **kwargs)

Call pd.concat

Source code in utilz/dfverbs/verbs.py
112
113
114
115
@curry
def concat(*args, **kwargs):
    """Curried wrapper that forwards everything to `pd.concat`."""
    out = pd.concat(*args, **kwargs)
    return out

drop(*args)

Call a dataframe's .drop(axis=1) method. Column names should be passed as multiple args like .select(), e.g. _.drop('height', 'weight')

Source code in utilz/dfverbs/verbs.py
414
415
416
417
418
419
420
421
422
@curry
def drop(*args):
    """Drop one or more columns via the dataframe's `.drop(axis=1)`. Pass names
    as separate args like `.select()`, e.g. `_.drop('height', 'weight')`"""
    return lambda df: do("drop", df, list(args), axis=1)

fillna(*args, **kwargs)

Call a dataframe's fillna method

Source code in utilz/dfverbs/verbs.py
574
575
576
577
578
579
580
581
@curry
def fillna(*args, **kwargs):
    """Curried wrapper around a dataframe's `.fillna` method."""
    return lambda df: df.fillna(*args, **kwargs)

groupby(*args)

Call a dataframe's .groupby method

Source code in utilz/dfverbs/verbs.py
87
88
89
90
91
92
93
94
@curry
def groupby(*args):
    """Curried wrapper around a dataframe's `.groupby` method."""
    return lambda df: do("groupby", df, list(args))

head(*args, **kwargs)

Call dataframe's .head() method

Source code in utilz/dfverbs/verbs.py
394
395
396
397
398
399
400
401
@curry
def head(*args, **kwargs):
    """Curried wrapper around a dataframe's `.head()` method."""
    return lambda df: df.head(*args, **kwargs)

join(*args, **kwargs)

Join dataframes using the first dataframe's .join method

Source code in utilz/dfverbs/verbs.py
124
125
126
127
@curry
def join(*args, **kwargs):
    """Join dataframes by calling the first one's `.join` method with the rest,
    e.g. `join(left, right, how='inner')` runs `left.join(right, how='inner')`.

    Note: pandas has no top-level `pd.join` function (the previous
    implementation called it and always raised AttributeError); joining is a
    DataFrame method.

    Raises:
        TypeError: if no dataframe is given
    """
    if not args:
        raise TypeError("join requires at least one dataframe")
    left, *rest = args
    return left.join(*rest, **kwargs)

merge(*args, **kwargs)

Call pd.merge

Source code in utilz/dfverbs/verbs.py
118
119
120
121
@curry
def merge(*args, **kwargs):
    """Call `pd.merge` with the given args/kwargs, e.g. to merge two dataframes
    on shared columns."""
    return pd.merge(*args, **kwargs)

mutate(dfg, **kwargs)

Creates a new column(s) in a DataFrame based on a function of existing columns in the DataFrame. Always returns a dataframe the same size as the original. For groupby inputs, the result is always ungrouped.

Just like .summarize(), input should be kwargs organized like new_column = str | function. Such as: _.mutate(weight_centered='weight - weight.mean()') or _.mutate(weight_centered=lambda weight: weight - weight.mean()) or _.mutate(weight_centered=lambda df: df['weight'].apply(lambda x: x - x.mean())). To return output smaller than the input dataframe use .summarize() instead.

Source code in utilz/dfverbs/verbs.py
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
@curry
def mutate(dfg, **kwargs):
    """
    Creates a new column(s) in a DataFrame based on a function of existing columns in
    the DataFrame. Always returns a dataframe the same size as the original. For groupby
    inputs, **the result is always ungrouped.**

    Just like `.summarize()`, input should be kwargs organized like `new_column = str|
    function`. Such as: `_.mutate(weight_centered ='weight - weight.mean()')`
    or `_.mutate(weight_centered = lambda weight: weight - weight.mean())` or
    `_.mutate(weight_centered = lambda df: df['weight'].apply(lambda x: x - x.mean()))`.
    To return output *smaller* than the input dataframe use `.summarize()` instead.
    """

    if isinstance(dfg, pd.core.groupby.generic.DataFrameGroupBy):
        # Grouped input: compute each column group-wise, then merge it back
        # into a copy of the underlying (ungrouped) frame
        prev = dfg.obj.copy()
        for _, (k, v) in enumerate(kwargs.items()):
            if isinstance(v, str):
                # String expressions are evaluated per-group via .eval
                res = dfg.apply(lambda group: group.eval(v)).reset_index()
            elif callable(v):
                # The function's arg names decide what it receives:
                # 'df'/'g'/'group' -> whole group; otherwise column(s) by name
                name = v.__code__.co_varnames
                if len(name) == 1:
                    # Normal assign where we pass in the entire dataframe to the calling
                    # function
                    if name[0] in ["df", "g", "group"]:
                        res = dfg.apply(v).reset_index()
                    else:
                        # Single column apply
                        res = dfg.apply(lambda g: v(g[name[0]])).reset_index()
                else:
                    # Multi-column: each arg name is looked up as a column
                    res = dfg.apply(lambda g: v(*[g[e] for e in name])).reset_index()
            else:
                raise TypeError(
                    f"grouped dataframes cannot make direct assignments. You must pass in a str to be evaluated or a function but you passed in a type{v}"
                )

            # Calling an operation that returns df the same size as the original df,
            # like transform, e.g. 'A1 - A1.mean()'
            if res.shape[0] == prev.shape[0]:
                # Find the auto-generated 'level_*' column that holds the
                # original row index so results can be aligned back on it
                level_col_idx, level_col_name = [
                    (i, col)
                    for i, col in enumerate(res.columns)
                    if str(col).startswith("level_")
                ][0]

                # Last column of the apply output is the computed value
                res = res.rename(columns={res.columns[-1]: k})

                # Allow column overwriting
                if k in prev:
                    prev = prev.drop(columns=k).merge(
                        res.iloc[:, level_col_idx:],
                        left_index=True,
                        right_on=level_col_name,
                    )
                else:
                    # prev = prev.join(res[k])
                    prev = prev.merge(
                        res.iloc[:, level_col_idx:],
                        left_index=True,
                        right_on=level_col_name,
                    )
                prev = prev.drop(columns=level_col_name).reset_index(drop=True)
            else:
                # otherwise operation returns smaller
                # so we need to join on the grouping col which is the name of the first
                # col in the output
                res = res.rename(columns={res.columns[-1]: k})
                # Allow column overwriting
                if k in prev:
                    prev = prev.drop(columns=k).merge(
                        res, on=res.columns[:-1].to_list()
                    )
                else:
                    prev = prev.merge(res, on=res.columns[:-1].to_list())
        return prev
    else:
        # Plain DataFrame: build up columns with .assign on a copy
        out = dfg.copy()
        for k, v in kwargs.items():
            if isinstance(v, str):
                out = out.assign(**{k: dfg.eval(v)})
            elif callable(v):
                name = v.__code__.co_varnames
                if len(name) == 1:
                    # Normal assign where we pass in the entire dataframe to the calling
                    # function
                    if name[0] == "df":
                        out = out.assign(**{k: v})
                    else:
                        # Single column apply
                        out = out.assign(**{k: lambda df: v(df[name[0]])})
                else:
                    # Multi-column: pass each named column positionally
                    # get columns as list
                    cols = [dfg[e] for e in name]
                    out = out.assign(**{k: v(*cols)})
            else:
                # Normal assignment
                out = out.assign(**{k: v})

        return out

pivot_longer(*args, **kwargs)

Convert a list of columns into 2 columns. Can pass a list of columns to melt-down or id_vars to select everything else: e.g. _.pivot_longer(['male', 'female'], into=('gender', 'response')) or _.pivot_longer(id_vars='SID', into=('gender','response'))

Parameters:

Name Type Description Default
columns list or None

columns to melt; Defaults to None

required
id_vars list or None

columns to use as id variables; Default to None

required
into tuple

cols to create Defaults to ("variable", "value").

required
make_index bool

does a reset_index prior to melting and adds the

required
Source code in utilz/dfverbs/verbs.py
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
@curry
def pivot_longer(*args, **kwargs):
    """
    Melt a set of columns into two columns. Either pass a list of columns to
    melt or `id_vars` to melt everything else, e.g.
    `_.pivot_longer(['male', 'female'], into=('gender', 'response'))` or
    `_.pivot_longer(id_vars='SID', into=('gender', 'response'))`

    Args:
        columns (list or None): columns to melt; Defaults to None
        id_vars (list or None): columns to use as id variables; Default to None
        into (tuple, optional): cols to create Defaults to ("variable", "value").
        make_index (bool, optional): does a reset_index prior to melting and adds the
        index col to id_vars. Defaults to False.

    """
    return lambda df: df.pivot_longer(*args, **kwargs)

pivot_wider(*args, **kwargs)

Convert a pair of columns to multiple columns, e.g. _.pivot_wider('condition', using='response')

Parameters:

Name Type Description Default
column str

string name of column to "explode"

required
using str

string name of column whose values should be placed into the new columns

required
drop_index bool; optional

if a 'prev_index' col exists (usually created by

required
Source code in utilz/dfverbs/verbs.py
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
@curry
def pivot_wider(*args, **kwargs):
    """
    Spread a pair of columns into multiple columns, e.g.
    `_.pivot_wider('condition', using='response')`

    Args:
        column (str): string name of column to "explode"
        using (str): string name of column whose values should be placed into the new columns
        drop_index (bool; optional): if a 'prev_index' col exists (usually created by
        make_index=True in pivot_longer) will drop it; Default True

    """
    return lambda df: df.pivot_wider(*args, **kwargs)

query(q, **kwargs)

Call a dataframe object's .query method. Resets and drops index by default. Change this with reset_index='drop'|'reset'|'none'

Source code in utilz/dfverbs/verbs.py
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
@curry
def query(q, **kwargs):
    """
    Call a dataframe object's `.query` method. Resets and drops index by
    default. Change this with `reset_index='drop'|'reset'|'none'`

    Args:
        q (str or callable): a query string evaluated by `df.query`, or a
            function. A callable whose single arg is named `df` is passed to
            `df.loc`; a callable with a single differently-named arg receives
            that column as a Series; multi-arg callables receive each named
            column positionally.

    Raises:
        TypeError: if `q` is neither a string nor a callable
    """
    reset_index = kwargs.pop("reset_index", "drop")

    def call(df):
        if isinstance(q, str):
            df = df.query(q, **kwargs)
        elif callable(q):
            name = q.__code__.co_varnames
            if len(name) == 1:
                if name[0] == "df":
                    df = df.loc[q]
                else:
                    df = df[q(df[name[0]])]
            else:
                df = df[q(*[df[e] for e in name])]
        else:
            # Previously any other type fell through silently and returned the
            # unfiltered dataframe; fail loudly instead
            raise TypeError(f"q must be a str or callable, but received {type(q)}")

        return _reset_index_helper(df, reset_index)

    return call

read_csv(*args, **kwargs)

Call pd.read_csv

Source code in utilz/dfverbs/verbs.py
106
107
108
109
@curry
def read_csv(*args, **kwargs):
    """Curried wrapper that forwards everything to `pd.read_csv`."""
    out = pd.read_csv(*args, **kwargs)
    return out

rename(cols, df)

Rename one or more columns. Can either input a single tuple to rename 1 column or a dict to rename multiple

Source code in utilz/dfverbs/verbs.py
 97
 98
 99
100
101
102
103
@curry
def rename(cols, df):
    """Rename one or more columns. Accepts either a single (old, new) tuple to
    rename one column or a dict to rename several."""
    mapping = dict([cols]) if isinstance(cols, tuple) else cols
    return df.rename(columns=mapping)

replace(*args, **kwargs)

Call a dataframe's replace method

Source code in utilz/dfverbs/verbs.py
584
585
586
587
588
589
590
591
@curry
def replace(*args, **kwargs):
    """Curried wrapper around a dataframe's `.replace` method."""
    return lambda df: df.replace(*args, **kwargs)

reset_index(*args, **kwargs)

Call a dataframe's reset_index method

Source code in utilz/dfverbs/verbs.py
594
595
596
597
598
599
600
601
@curry
def reset_index(*args, **kwargs):
    """Curried wrapper around a dataframe's `.reset_index` method."""
    return lambda df: df.reset_index(*args, **kwargs)

select(*args)

Select one or more columns by name. Drop one or more columns by prepending '-' to the name. Always returns a dataframe even if there is just 1 column. Does not support renaming

Source code in utilz/dfverbs/verbs.py
425
426
427
428
429
430
431
432
433
434
435
@curry
def select(*args):
    """
    Select one or more columns by name, or drop them by prefixing the name
    with '-'. **Always returns a dataframe** even for a single column.
    Renaming is not supported.
    """
    return lambda df: do("select", df, *args)

sort(*args, **kwargs)

Sort df by one or more columns passed as args. Ignores index by default but you can change that with ignore_index=False.

Source code in utilz/dfverbs/verbs.py
521
522
523
524
525
526
527
528
529
530
@curry
def sort(*args, **kwargs):
    """Sort df by one or more columns passed as args. The index is ignored by
    default; pass `ignore_index=False` to keep it."""
    ignore_index = kwargs.pop("ignore_index", True)
    return lambda df: df.sort_values(
        by=list(args), ignore_index=ignore_index, **kwargs
    )

split(*args, sep=' ')

Split values in single df column into multiple columns by separator, e.g. First-Last -> [First], [Last]. To split list elements use [] as the sep, e.g. [1,2,3] -> [1], [2], [3]

Parameters:

Name Type Description Default
column str

column to split

required
into list

new columns names to create

required
sep str, list

separator to split on. Use [] for list

' '
Source code in utilz/dfverbs/verbs.py
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
@curry
def split(*args, sep=" "):
    """
    Split values in single df column into multiple columns by separator, e.g.
    First-Last -> [First], [Last]. To split list elements use [] as the sep, e.g.
    [1,2,3] -> [1], [2], [3]

    Args:
        column (str): column to split
        into (list): new columns names to create
        sep (str, list): separator to split on. Use [] for list

    Raises:
        TypeError: if `sep` is neither a str nor a list
        ValueError: if the number of names in `into` doesn't match the number
            of columns the split produces
    """

    col, into = args

    def call(df):
        if isinstance(sep, str):
            out = df[col].str.split(sep, expand=True)
        elif isinstance(sep, list):
            out = pd.DataFrame(df[col].to_list())
        else:
            # Previously an unsupported sep fell through and raised a confusing
            # NameError on `out` below; fail with a clear message instead
            raise TypeError(f"sep must be a str or list, but received {type(sep)}")
        if len(into) != out.shape[1]:
            raise ValueError(
                f"into has {len(into)} elements, but splitting creates a dataframe with {out.shape[1]} columns"
            )
        else:
            out.columns = list(into)

        return pd.concat([df.drop(columns=col), out], axis=1)

    return call

splitquery(query, **kwargs)

Call a dataframe or groupby object's .query method and return 2 dataframes: one containing results where the query is true and one containing its inverse. Resets and drops index by default. Change this with reset_index='drop'|'reset'|'none'

Source code in utilz/dfverbs/verbs.py
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
@curry
def splitquery(query, **kwargs):
    """
    Call a dataframe or groupby object's `.query` method and return 2
    dataframes: one containing rows where the query is true and one containing
    its inverse. Resets and drops index by default. Change this with
    `reset_index='drop'|'reset'|'none'`

    Raises:
        TypeError: if `query` is neither a str nor a callable
    """
    reset_index = kwargs.pop("reset_index", "drop")

    def call(df):
        if isinstance(query, str):
            df_yes = df.query(query, **kwargs)
            df_no = df.query(f"not ({query})", **kwargs)
        elif callable(query):
            # Evaluate the callable once to get a boolean mask. The previous
            # implementation did `df.loc[~(query)]`, applying `~` to the
            # function object itself, which raised a TypeError for every
            # callable query.
            mask = query(df)
            df_yes = df.loc[mask]
            df_no = df.loc[~mask]
        else:
            raise TypeError(
                f"query must be a str or callable, but received {type(query)}"
            )

        return (
            _reset_index_helper(df_yes, reset_index),
            _reset_index_helper(df_no, reset_index),
        )

    return call

squeeze(*args, **kwargs)

Call a dataframe's .squeeze method

Source code in utilz/dfverbs/verbs.py
57
58
59
60
61
62
63
64
@curry
def squeeze(*args, **kwargs):
    """Curried wrapper around a dataframe's `.squeeze` method."""
    return lambda df: df.squeeze(*args, **kwargs)

summarize(dfg, **kwargs)

Create new columns based on existing columns in a dataframe but return a smaller dataframe than the original. Works with the output of groupby as well:

Just like .mutate()/.transmute(), input should be kwargs organized like new_column = str | function. Such as: _.summarize(weight_mean='weight.mean()') or _.summarize(weight_mean=lambda weight: weight.mean()) or _.summarize(weight_mean=lambda df: df['weight'].mean()). To return output the same size as the input dataframe use .mutate() or .transmute() instead as either will broadcast values to the right size.

Source code in utilz/dfverbs/verbs.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
@curry
def summarize(dfg, **kwargs):
    """
    Create new columns based on existing columns in a dataframe but return a
    **smaller** dataframe than the original. Works with the output of `groupby` as well:

    Just like `.mutate()/.transmute()`, input should be kwargs organized like
    `new_column = str| function`. Such as: `_.summarize(weight_mean ='weight.mean()')`
    or `_.summarize(weight_mean = lambda weight: weight.mean())` or
    `_.summarize(weight_mean = lambda df: df['weight'].mean())`. To return output the
    same size as the input dataframe use `.mutate()` or `.transmute()` instead as
    either will *broadcast* values to the right size.

    Raises:
        TypeError: if a kwarg value is neither a str nor a callable, or if the
            input is neither a DataFrame nor a GroupBy
        ValueError: if an expression returns a non-scalar (per-group) result
    """

    if isinstance(dfg, pd.core.groupby.generic.DataFrameGroupBy):
        # Grouped input: compute each summary per group and merge successive
        # summaries together on the shared grouping columns
        out = None
        for k, v in kwargs.items():
            if isinstance(v, str):
                # String expressions evaluate per-group via .eval
                res = dfg.apply(lambda group: group.eval(v)).reset_index()
            elif callable(v):
                # The function's arg names decide what it receives:
                # 'df'/'g'/'group' -> whole group; otherwise column(s) by name
                name = v.__code__.co_varnames
                if len(name) == 1:
                    if name[0] in ["df", "g", "group"]:
                        res = dfg.apply(v).reset_index()
                    else:
                        # Single column summarize
                        res = dfg.apply(lambda g: v(g[name[0]])).reset_index()
                else:
                    # Multi-column summarize
                    res = dfg.apply(lambda g: v(*[g[e] for e in name])).reset_index()
            else:
                raise TypeError(
                    f"summarize expects input kwargs organized like: new_colname = str | func, but receive type: {type(v)}"
                )
            # Last column of the apply output is the computed summary value
            res = res.rename(columns={res.columns[-1]: k})
            if not res.shape[0] < dfg.obj.shape[0]:
                raise ValueError(
                    "functions and expressions received by summarize should return a scalar output. If you want to broadcast this value over the entire dataframe use assign() instead."
                )
            if out is None:
                out = res
            else:
                out = out.drop(columns=k, errors="ignore").merge(
                    res, on=res.columns[:-1].to_list()
                )
        return out
    elif isinstance(dfg, pd.DataFrame):
        # Ungrouped input: each summary collapses to a scalar, so collect them
        # into a single-row dataframe
        out = dict()
        for k, v in kwargs.items():
            if isinstance(v, str):
                out[k] = dfg.eval(v)
            elif callable(v):
                name = v.__code__.co_varnames
                if len(name) == 1:
                    if name[0] == "df":
                        out[k] = v(dfg)
                    else:
                        # Single column summarize
                        out[k] = v(dfg[name[0]])
                else:
                    # multi-col summarize
                    cols = [dfg[e] for e in name]
                    out[k] = v(*cols)
            else:
                raise TypeError(
                    f"summarized expects input kwargs organized like: new_colname = str | func, but receive type: {type(v)}"
                )

        return pd.DataFrame(out, index=[0])
    else:
        raise TypeError(
            f"summarize expected previous step to be a DataFrame or GroupBy, but received a {type(dfg)}. If you used select(), you should instead select the column in the expression or function passed to summarize(new_col='old_col.mean()'). If you intended to run an expression summarize takes kwargs organized like: new_colname = str | func. This differs from agg in pandas which expects a column name and expression!"
        )

tail(*args, **kwargs)

Call dataframe's .tail() method

Source code in utilz/dfverbs/verbs.py
404
405
406
407
408
409
410
411
@curry
def tail(*args, **kwargs):
    """Curried wrapper around a dataframe's `.tail()` method."""
    return lambda df: df.tail(*args, **kwargs)

to_csv(path, df, index=False)

Call a dataframe's .to_csv(index=False) method

Source code in utilz/dfverbs/verbs.py
130
131
132
133
134
135
136
@curry
def to_csv(path, df, index=False):
    """Write `df` to csv at `path` (appending '.csv' if missing) and pass the
    dataframe through unchanged. The index is dropped unless `index=True`."""
    target = str(path)
    if not target.endswith(".csv"):
        target = f"{target}.csv"
    df.to_csv(target, index=index)
    return df

to_list(*args, **kwargs)

Call a dataframe's .to_list method

Source code in utilz/dfverbs/verbs.py
77
78
79
80
81
82
83
84
@curry
def to_list(*args, **kwargs):
    """Curried wrapper around a dataframe's `.to_list` method."""
    return lambda df: df.to_list(*args, **kwargs)

to_numpy(*args, **kwargs)

Call a dataframe's .to_numpy method

Source code in utilz/dfverbs/verbs.py
67
68
69
70
71
72
73
74
@curry
def to_numpy(*args, **kwargs):
    """Curried wrapper around a dataframe's `.to_numpy` method."""
    return lambda df: df.to_numpy(*args, **kwargs)

transmute(dfg, **kwargs)

Just like .mutate(), but only returns the newly created columns.

Source code in utilz/dfverbs/verbs.py
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
@curry
def transmute(dfg, **kwargs):
    """Just like `.mutate()`, but only returns the newly created columns.

    Raises:
        ValueError: if every created column overwrote an existing one, leaving
            nothing new to return
    """
    if isinstance(
        dfg,
        (
            pd.core.groupby.generic.DataFrameGroupBy,
            pd.core.groupby.generic.SeriesGroupBy,
        ),
    ):
        # Grouped inputs: the original columns live on the underlying frame
        orig = dfg.obj
    else:
        orig = dfg
    out = mutate(dfg, **kwargs)
    # Drop every column that already existed so only the new ones remain.
    # NOTE(review): `filter` here is presumably utilz's own helper (returns the
    # names common to both lists), not the builtin — confirm against the module
    # imports
    cols = filter(list(orig.columns), list(out.columns), substr_match=False)
    out = out.drop(columns=cols)

    if out.shape[1] < 1:
        raise ValueError(
            "transmute does not support reassigning to an existing column. Give your new column(s) a different name(s) to extract"
        )
    else:
        return out

Stats

dataframe stats methods

abs(*args, **kwargs)

Call df.abs

Source code in utilz/dfverbs/stats.py
151
152
153
154
155
156
157
158
@curry
def abs(*args, **kwargs):
    """Curried wrapper around `df.abs`."""
    return lambda df: df.abs(*args, **kwargs)

all(*args, **kwargs)

Call df.all

Source code in utilz/dfverbs/stats.py
171
172
173
174
175
176
177
178
@curry
def all(*args, **kwargs):
    """Curried wrapper around `df.all`."""
    return lambda df: df.all(*args, **kwargs)

any(*args, **kwargs)

Call df.any

Source code in utilz/dfverbs/stats.py
181
182
183
184
185
186
187
188
@curry
def any(*args, **kwargs):
    """Curried wrapper around `df.any`."""
    return lambda df: df.any(*args, **kwargs)

bootci(col, **kwargs)

Calculate 95% bootstrapped confidence intervals on the mean of a column. Unlike summarize, bootci expects a string column name and will return a summary frame with columns for the mean, 2.5% and 97.5% confidence limits. Use as_deviation=True to convert the CIs to deviations from the mean. Accepts all the same args as seaborn.algorithms.bootstrap, e.g. units.

Source code in utilz/dfverbs/stats.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
@curry
def bootci(col, **kwargs):
    """Calculate 95% bootstrapped confidence intervals on the mean of a column. Unlike
    summarize, bootci expects a string column name and will return a summary frame with
    columns for the mean, 2.5% and 97.5% confidence limits. Use `as_deviation=True` to
    convert the CIs to deviations from the mean. Accepts all the same args as
    `seaborn.algorithms.bootstrap`, e.g. `units`.

    Raises:
        TypeError: if the piped input is not a grouped dataframe
    """

    # Whether to report CI bounds as distances from the mean instead of raw limits
    deviation = kwargs.pop("as_deviation", False)

    def call(df):
        if isinstance(df, pd.core.groupby.generic.DataFrameGroupBy):
            units = kwargs.pop("units", None)

            # Bootstrap each group's column, take the CI, and split the
            # resulting (lower, upper) pair into two columns
            cis = pipe(
                df,
                apply(
                    lambda g: sns.utils.ci(
                        sns.algorithms.bootstrap(
                            g[col],
                            units=g[units] if units is not None else None,
                            **kwargs,
                        )
                    ),
                    reset_index="reset",
                ),
                split(0, [f"{col}_ci_l", f"{col}_ci_u"], sep=[]),
            )
            # Per-group means to sit alongside the interval limits
            summary = pipe(df, summarize(**{f"{col}_mean": f"{col}.mean()"}))
            matching_cols = filter(summary.columns, cis.columns)
            cis = pipe(cis, merge(summary, on=matching_cols))
            if deviation:
                # Convert raw limits into +/- distances from the mean
                cis = pipe(
                    cis,
                    mutate(
                        **{
                            f"{col}_ci_l": f"{col}_mean - {col}_ci_l",
                            f"{col}_ci_u": f"{col}_ci_u - {col}_mean",
                        },
                    ),
                )

            return cis
        else:
            raise TypeError(
                "bootci only works on grouped dataframes, trying call _.groupby before"
            )

    return call

corr(*args, **kwargs)

Call df.corr

Source code in utilz/dfverbs/stats.py
191
192
193
194
195
196
197
198
@curry
def corr(*args, **kwargs):
    """Curried wrapper around `df.corr`."""
    return lambda df: df.corr(*args, **kwargs)

count(*args, **kwargs)

Call df.count

Source code in utilz/dfverbs/stats.py
211
212
213
214
215
216
217
218
@curry
def count(*args, **kwargs):
    """Curried wrapper around `df.count`."""
    return lambda df: df.count(*args, **kwargs)

cov(*args, **kwargs)

Call df.cov

Source code in utilz/dfverbs/stats.py
201
202
203
204
205
206
207
208
@curry
def cov(*args, **kwargs):
    """Curried wrapper around `df.cov`."""
    return lambda df: df.cov(*args, **kwargs)

max(*args, **kwargs)

Call df.max

Source code in utilz/dfverbs/stats.py
71
72
73
74
75
76
77
78
@curry
def max(*args, **kwargs):
    """Curried wrapper around `df.max`."""
    return lambda df: df.max(*args, **kwargs)

mean(*args, **kwargs)

Call df.mean

Source code in utilz/dfverbs/stats.py
41
42
43
44
45
46
47
48
@curry
def mean(*args, **kwargs):
    """Curried wrapper around `df.mean`."""
    return lambda df: df.mean(*args, **kwargs)

median(*args, **kwargs)

Call df.median

Source code in utilz/dfverbs/stats.py
51
52
53
54
55
56
57
58
@curry
def median(*args, **kwargs):
    """Curried wrapper around `df.median`."""
    return lambda df: df.median(*args, **kwargs)

min(*args, **kwargs)

Call df.min

Source code in utilz/dfverbs/stats.py
61
62
63
64
65
66
67
68
@curry
def min(*args, **kwargs):
    """Curried wrapper around `df.min`."""
    return lambda df: df.min(*args, **kwargs)

mode(*args, **kwargs)

Call df.mode

Source code in utilz/dfverbs/stats.py
81
82
83
84
85
86
87
88
@curry
def mode(*args, **kwargs):
    """Curried wrapper around `df.mode`."""
    return lambda df: df.mode(*args, **kwargs)

nunique(*args, **kwargs)

Call df.nunique

Source code in utilz/dfverbs/stats.py
242
243
244
245
246
247
248
249
250
@curry
def nunique(*args, **kwargs):
    """Call df.nunique"""

    def call(df):
        out = df.nunique(*args, **kwargs)
        return out.reset_index().rename(columns={"index": "column", 0: "nunique"})

    return call

prod(*args, **kwargs)

Call df.prod

Source code in utilz/dfverbs/stats.py
131
132
133
134
135
136
137
138
@curry
def prod(*args, **kwargs):
    """Call df.prod"""

    def call(df):
        return df.prod(*args, **kwargs)

    return call

rank(*args, **kwargs)

Call df.rank

Source code in utilz/dfverbs/stats.py
264
265
266
267
268
269
270
271
@curry
def rank(*args, **kwargs):
    """Call df.rank"""

    def call(df):
        return df.rank(*args, **kwargs)

    return call

round(*args, **kwargs)

Call df.round

Source code in utilz/dfverbs/stats.py
141
142
143
144
145
146
147
148
@curry
def round(*args, **kwargs):
    """Call df.round"""

    def call(df):
        return df.round(*args, **kwargs)

    return call

sem(*args, **kwargs)

Call df.sem

Source code in utilz/dfverbs/stats.py
121
122
123
124
125
126
127
128
@curry
def sem(*args, **kwargs):
    """Call df.sem"""

    def call(df):
        return df.sem(*args, **kwargs)

    return call

size(*args, **kwargs)

Call df.size

Source code in utilz/dfverbs/stats.py
274
275
276
277
278
279
280
281
@curry
def size(*args, **kwargs):
    """Call df.size"""

    def call(df):
        return df.size(*args, **kwargs)

    return call

sqrt(*args, **kwargs)

Call df.sqrt

Source code in utilz/dfverbs/stats.py
161
162
163
164
165
166
167
168
@curry
def sqrt(*args, **kwargs):
    """Call df.sqrt"""

    def call(df):
        return df.sqrt(*args, **kwargs)

    return call

std(*args, **kwargs)

Call df.std

Source code in utilz/dfverbs/stats.py
101
102
103
104
105
106
107
108
@curry
def std(*args, **kwargs):
    """Call df.std"""

    def call(df):
        return df.std(*args, **kwargs)

    return call

sum(*args, **kwargs)

Call df.sum

Source code in utilz/dfverbs/stats.py
111
112
113
114
115
116
117
118
@curry
def sum(*args, **kwargs):
    """Call df.sum"""

    def call(df):
        return df.sum(*args, **kwargs)

    return call

unique(*args, **kwargs)

Call df.unique

Source code in utilz/dfverbs/stats.py
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
@curry
def unique(*args, **kwargs):
    """Call df.unique"""

    def call(df):
        # Unique only exists on series, so squeeze down single col df or loop over cols
        if df.shape[1] == 1:
            out = df.squeeze().unique(*args, **kwargs)
        else:
            out = df.apply(lambda col: col.unique(*args, **kwargs))

        # just one col
        if isinstance(out, np.ndarray):
            out = pd.DataFrame(out, columns=["unique"])
            out["column"] = df.columns[0]
            return out[["column", "unique"]]
        return out.reset_index().rename(columns={"index": "column", 0: "unique"})

    return call

value_counts(*args, **kwargs)

Call df.value_counts

Source code in utilz/dfverbs/stats.py
253
254
255
256
257
258
259
260
261
@curry
def value_counts(*args, **kwargs):
    """Call df.value_counts"""

    def call(df):
        out = df.value_counts(*args, **kwargs)
        return out.reset_index().rename(columns={"index": "column", 0: "count"})

    return call

var(*args, **kwargs)

Call df.var

Source code in utilz/dfverbs/stats.py
91
92
93
94
95
96
97
98
@curry
def var(*args, **kwargs):
    """Call df.var"""

    def call(df):
        return df.var(*args, **kwargs)

    return call

Plots

plotting verbs to wrap calls to seaborn

barplot(**kwargs)

Call to seaborn barplot

Source code in utilz/dfverbs/plot.py
269
270
271
272
273
274
275
276
@curry
def barplot(**kwargs):
    """Call to seaborn barplot"""

    def plot(data):
        return sns.barplot(data=data, ax=newax(), **kwargs)

    return plot

boxenplot(**kwargs)

Call to seaborn boxenplot

Source code in utilz/dfverbs/plot.py
119
120
121
122
123
124
125
126
@curry
def boxenplot(**kwargs):
    """Call to seaborn boxenplot"""

    def plot(data):
        return sns.boxenplot(data=data, ax=newax(), **kwargs)

    return plot

boxplot(**kwargs)

Call to seaborn boxplot

Source code in utilz/dfverbs/plot.py
139
140
141
142
143
144
145
146
@curry
def boxplot(**kwargs):
    """Call to seaborn boxplot"""

    def plot(data):
        return sns.boxplot(data=data, ax=newax(), **kwargs)

    return plot

catplot(**kwargs)

Call to seaborn catplot

Source code in utilz/dfverbs/plot.py
259
260
261
262
263
264
265
266
@curry
def catplot(**kwargs):
    """Call to seaborn catplot"""

    def plot(data):
        return sns.catplot(data=data, **kwargs)

    return plot

clustermap(**kwargs)

Call to seaborn clustermap

Source code in utilz/dfverbs/plot.py
59
60
61
62
63
64
65
66
@curry
def clustermap(**kwargs):
    """Call to seaborn clustermap"""

    def plot(data):
        return sns.clustermap(data=data, **kwargs)

    return plot

countplot(**kwargs)

Call to seaborn countplot

Source code in utilz/dfverbs/plot.py
 99
100
101
102
103
104
105
106
@curry
def countplot(**kwargs):
    """Call to seaborn countplot"""

    def plot(data):
        return sns.countplot(data=data, ax=newax(), **kwargs)

    return plot

displot(**kwargs)

Call to seaborn displot

Source code in utilz/dfverbs/plot.py
209
210
211
212
213
214
215
216
@curry
def displot(**kwargs):
    """Call to seaborn displot"""

    def plot(data):
        return sns.displot(data=data, **kwargs)

    return plot

ecdfplot(**kwargs)

Call to seaborn ecdfplot

Source code in utilz/dfverbs/plot.py
179
180
181
182
183
184
185
186
@curry
def ecdfplot(**kwargs):
    """Call to seaborn ecdfplot"""

    def plot(data):
        return sns.ecdfplot(data=data, ax=newax(), **kwargs)

    return plot

heatmap(**kwargs)

Call to seaborn heatmap

Source code in utilz/dfverbs/plot.py
239
240
241
242
243
244
245
246
@curry
def heatmap(**kwargs):
    """Call to seaborn heatmap"""

    def plot(data):
        return sns.heatmap(data=data, ax=newax(), **kwargs)

    return plot

histplot(**kwargs)

Call to seaborn histplot

Source code in utilz/dfverbs/plot.py
199
200
201
202
203
204
205
206
@curry
def histplot(**kwargs):
    """Call to seaborn histplot"""

    def plot(data):
        return sns.histplot(data=data, ax=newax(), **kwargs)

    return plot

jointplot(**kwargs)

Call to seaborn jointplot

Source code in utilz/dfverbs/plot.py
39
40
41
42
43
44
45
46
@curry
def jointplot(**kwargs):
    """Call to seaborn jointplot"""

    def plot(data):
        return sns.jointplot(data=data, ax=newax(), **kwargs)

    return plot

kdeplot(**kwargs)

Call to seaborn kdeplot

Source code in utilz/dfverbs/plot.py
189
190
191
192
193
194
195
196
@curry
def kdeplot(**kwargs):
    """Call to seaborn kdeplot"""

    def plot(data):
        return sns.kdeplot(data=data, ax=newax(), **kwargs)

    return plot

lineplot(**kwargs)

Call to seaborn lineplot

Source code in utilz/dfverbs/plot.py
249
250
251
252
253
254
255
256
@curry
def lineplot(**kwargs):
    """Call to seaborn lineplot"""

    def plot(data):
        return sns.lineplot(data=data, ax=newax(), **kwargs)

    return plot

lmplot(**kwargs)

Call to seaborn lmplot

Source code in utilz/dfverbs/plot.py
89
90
91
92
93
94
95
96
@curry
def lmplot(**kwargs):
    """Call to seaborn lmplot"""

    def plot(data):
        return sns.lmplot(data=data, **kwargs)

    return plot

pairplot(**kwargs)

Call to seaborn pairplot

Source code in utilz/dfverbs/plot.py
49
50
51
52
53
54
55
56
@curry
def pairplot(**kwargs):
    """Call to seaborn pairplot"""

    def plot(data):
        return sns.pairplot(data=data, **kwargs)

    return plot

plot(*args, **kwargs)

Call a dataframe's .plot method

Source code in utilz/dfverbs/plot.py
290
291
292
293
294
295
296
297
@curry
def plot(*args, **kwargs):
    """Call a dataframe's .plot method"""

    def call(df):
        return df.plot(*args, **kwargs)

    return call

pointplot(**kwargs)

Call to seaborn pointplot

Source code in utilz/dfverbs/plot.py
109
110
111
112
113
114
115
116
@curry
def pointplot(**kwargs):
    """Call to seaborn pointplot"""

    def plot(data):
        return sns.pointplot(data=data, ax=newax(), **kwargs)

    return plot

regplot(**kwargs)

Call to seaborn regplot

Source code in utilz/dfverbs/plot.py
79
80
81
82
83
84
85
86
@curry
def regplot(**kwargs):
    """Call to seaborn regplot"""

    def plot(data):
        return sns.regplot(data=data, ax=newax(), **kwargs)

    return plot

relplot(**kwargs)

Call to seaborn relplot

Source code in utilz/dfverbs/plot.py
229
230
231
232
233
234
235
236
@curry
def relplot(**kwargs):
    """Call to seaborn relplot"""

    def plot(data):
        return sns.relplot(data=data, **kwargs)

    return plot

residplot(**kwargs)

Call to seaborn residplot

Source code in utilz/dfverbs/plot.py
69
70
71
72
73
74
75
76
@curry
def residplot(**kwargs):
    """Call to seaborn residplot"""

    def plot(data):
        return sns.residplot(data=data, ax=newax(), **kwargs)

    return plot

rugplot(**kwargs)

Call to seaborn rugplot

Source code in utilz/dfverbs/plot.py
169
170
171
172
173
174
175
176
@curry
def rugplot(**kwargs):
    """Call to seaborn rugplot"""

    def plot(data):
        return sns.rugplot(data=data, ax=newax(), **kwargs)

    return plot

scatterplot(**kwargs)

Call to seaborn scatterplot

Source code in utilz/dfverbs/plot.py
219
220
221
222
223
224
225
226
@curry
def scatterplot(**kwargs):
    """Call to seaborn scatterplot"""

    def plot(data):
        return sns.scatterplot(data=data, ax=newax(), **kwargs)

    return plot

stripbarplot(**kwargs)

Call to combined stripplot and barplot. See utilz.plot.stripbarplot

Source code in utilz/dfverbs/plot.py
279
280
281
282
283
284
285
286
287
@curry
def stripbarplot(**kwargs):
    """Call to combined stripplot and barplot. See utilz.plot.stripbarplot"""

    def plot(data):
        ax = kwargs.pop("ax", "newax")
        return _stripbarplot(data=data, ax=ax, **kwargs)

    return plot

stripplot(**kwargs)

Call to seaborn stripplot

Source code in utilz/dfverbs/plot.py
159
160
161
162
163
164
165
166
@curry
def stripplot(**kwargs):
    """Call to seaborn stripplot"""

    def plot(data):
        return sns.stripplot(data=data, ax=newax(), **kwargs)

    return plot

swarmplot(**kwargs)

Call to seaborn swarmplot

Source code in utilz/dfverbs/plot.py
149
150
151
152
153
154
155
156
@curry
def swarmplot(**kwargs):
    """Call to seaborn swarmplot"""

    def plot(data):
        return sns.swarmplot(data=data, ax=newax(), **kwargs)

    return plot

violinplot(**kwargs)

Call to seaborn violinplot

Source code in utilz/dfverbs/plot.py
129
130
131
132
133
134
135
136
@curry
def violinplot(**kwargs):
    """Call to seaborn violinplot"""

    def plot(data):
        return sns.violinplot(data=data, ax=newax(), **kwargs)

    return plot

utilz.dftools

Common data operations and transformations often on pandas dataframes. This creates new dataframe methods that can be called like this:

df.norm_by_group(grpcol='Class', valcol='Score')


assert_balanced_groups(df, grpcols, size=None)

Check if each group of grpcols has the same dimensions

Parameters:

Name Type Description Default
df pd.DataFrame

input dataframe

required
grpcols str/List

column names to group on in dataframe

required
size tuple/None

optional group sizes to ensure

required
Source code in utilz/dftools.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
@_register_dataframe_method
def assert_balanced_groups(df, grpcols: Union[str, List], size=None):
    """
    Check if each group of `grpcols` has the same dimensions

    Args:
        df (pd.DataFrame): input dataframe
        grpcols (str/List): column names to group on in dataframe
        size (tuple/None, optional): optional group sizes to ensure
    """

    grouped = df.groupby(grpcols).size()
    size = grouped[0] if size is None else size
    if not np.all(grouped == size):
        raise AssertionError(f"Group sizes don't match!\n{grouped}")
    else:
        return True

assert_same_nunique(df, grpcols, valcol, size=None)

Check if each group has the same number of unique values in valcol

Parameters:

Name Type Description Default
df pd.DataFrame

input dataframe

required
valcol str

column to check unique values in

required
grpcols str/list

column names to group on in dataframe, Default None

required
size tuple/None

optional sizes to ensure

required
Source code in utilz/dftools.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
@_register_dataframe_method
def assert_same_nunique(df, grpcols: Union[str, List], valcol: str, size=None):
    """
    Check if each group has the same number of unique values in `valcol`

    Args:
        df (pd.DataFrame): input dataframe
        valcol (str): column to check unique values in
        grpcols (str/list): column names to group on in dataframe, Default None
        size (tuple/None, optional): optional sizes to ensure
    """

    grouped = df.groupby(grpcols)[valcol].nunique()
    size = grouped[0] if size is None else size
    if not np.all(grouped == size):
        raise AssertionError(f"Groups don't have same nunique values!\n{grouped}")
    else:
        return True

norm_by_group(df, grpcol, valcols, center=True, scale=True, addcol=True)

Normalize values in one or more columns separately per group

Parameters:

Name Type Description Default
df pd.DataFrame

input dataframe

required
grpcol str

grouping col

required
valcols Union[str, List]

value cols

required
center bool

mean center. Defaults to True.

True
scale bool

divide by standard deviation. Defaults to True.

True
Source code in utilz/dftools.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
@_register_dataframe_method
def norm_by_group(df, grpcol, valcols, center=True, scale=True, addcol=True):
    """
    Normalize values in one or more columns separately per group

    Args:
        df (pd.DataFrame): input dataframe
        grpcol (str): grouping col
        valcols (Union[str, List]): value cols
        center (bool, optional): mean center. Defaults to True.
        scale (bool, optional): divide by standard deviation. Defaults to True.
    """

    def _norm(dat, center, scale):
        if center:
            dat = dat - dat.mean()
        if scale:
            dat = dat / dat.std()
        return dat

    if isinstance(grpcol, List):
        raise NotImplementedError("Grouping by multiple columns is not supported")

    if not isinstance(valcols, List):
        valcols = [valcols]

    out = df.groupby(grpcol)[valcols].transform(_norm, center, scale)

    if addcol:
        if center and not scale:
            idx = "centered"
        elif scale and not center:
            idx = "scaled"
        elif center and scale:
            idx = "normed"

        out = out.to_dict()
        assign_dict = {}
        for key in out.keys():
            assign_dict[f"{key}_{idx}_by_{grpcol}"] = out[key]
        out = df.assign(**assign_dict)
    return out.squeeze()

pivot_longer(df, columns=None, id_vars=None, into=('variable', 'value'), make_index=False)

Take multiple columns or multiple id_vars and melt them into 2 columns. If columns is provided, id_vars is inferred and vice-versa. If make_index=True, will use the current index as a new id_var to ensure a unique index.

Parameters:

Name Type Description Default
df pd.DataFrame

input DataFrame

required
columns list or None

columns to melt; Defaults to None

None
id_vars list or None

columns to use as id variables; Default to None

None
into tuple

cols to create Defaults to ("variable", "value").

('variable', 'value')
make_index bool

does a reset_index prior to melting and adds the index col to id_vars. Defaults to False.

False
Source code in utilz/dftools.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
@_register_dataframe_method
def pivot_longer(
    df, columns=None, id_vars=None, into=("variable", "value"), make_index=False
):
    """
    Take multiple columns or multiple id_vars and melt them into 2 columns. If columns
    is provided, id_vars is inferred and vice-versa. If make_index=True, will use the
    current index as a new id_var to ensure a unique index.

    Args:
        df (pd.DataFrame): input DataFrame
        columns (list or None): columns to melt; Defaults to None
        id_vars (list or None): columns to use as id variables; Default to None
        into (tuple, optional): cols to create Defaults to ("variable", "value").
        make_index (bool, optional): does a reset_index prior to melting and adds the
        index col to id_vars. Defaults to False.

    """
    # User provide list of columns to gather -> like in R
    if columns is not None:
        # Grab remaining columns if id_vars isn't provided
        if id_vars is None:
            id_vars = [col for col in df.columns if col not in columns]
    else:
        if id_vars is not None:
            columns = [col for col in df.columns if col not in id_vars]

    if make_index:
        df = df.reset_index().rename(columns={"index": "prev_index"})
        if id_vars is None:
            id_vars = "prev_index"
        else:
            id_vars = list(id_vars) + ["prev_index"]

    df = df.melt(
        id_vars=id_vars,
        value_vars=columns,
        var_name=into[0],
        value_name=into[1],
    )
    return df

pivot_wider(df, column, using, drop_index=True)

Cast a column of long-form tidy data to a set of wide columns based on the values in another column ('using')

Parameters:

Name Type Description Default
df pd.DataFrame

input dataframe

required
column str

string name of column to "explode"

required
using str

string name of column whose values should be placed into the new columns

required
drop_index bool; optional

if a 'prev_index' col exists (usually created by make_index=True in pivot_longer) will drop it; Default True

True
Source code in utilz/dftools.py
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
@_register_dataframe_method
def pivot_wider(df, column, using, drop_index=True):
    """
    Cast a column of long-form tidy data to a set of wide columns based on the values in
    another column ('using')

    Args:
        df (pd.DataFrame): input dataframe
        column (str): string name of column to "explode"
        using (str): string name of column whose values should be placed into the new
        columns
        drop_index (bool; optional): if a 'prev_index' col exists (usually created by
        make_index=True in pivot_longer) will drop it; Default True

    """
    index = [col for col in df.columns if col not in [column, using]]
    try:
        out = df.pivot(
            index=index,
            columns=column,
            values=using,
        ).reset_index()
        if drop_index:
            out = out.drop(columns=["prev_index"], errors="ignore")
        return out
    except ValueError as e:
        if "duplicate" in str(e):
            print(
                f"ERROR: It's not possible to infer what rows are unique from columns that make up the index: {index}. If you have multiple observations per index, then you should use .pivot_table and decide how to *aggregate* these observations. Otherwise .pivot_longer() can create a unique index for with make_index = True"
            )
        raise e

select(df, *args, **kwargs)

Select one or more columns by name. Drop one or more columns by prepending '-' to the name. Rename columns using keyword arguments.

Examples:

>>> # Grab 2 columns
>>> df.select('sepal_width', 'petal_width')
>>> # Get all columns except one
>>> df.select('-sepal_width')
>>> # Grab a column and rename it
>>> df.select(sepal_width='width')
Source code in utilz/dftools.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
@_register_dataframe_method
def select(df, *args, **kwargs):
    """
    Select one or more columns by name. Drop one or more columns by prepending '-' to
    the name. Rename columns using keyword arguments.

    Examples:

        >>> # Grab 2 columns
        >>> df.select('sepal_width', 'petal_width')


        >>> # Get all columns except one
        >>> df.select('-sepal_width')

        >>> # Grab a column and rename it
        >>> df.select(sepal_width='width')

    """
    # "Select as" functionality; get col and rename
    if kwargs:
        if args:
            raise ValueError(
                "mixing arguments and keyword arguments is not supported. If you want to filter columns and rename them, you should instead chain multiple calls to .select. For example: df.select('-sepal_length').select(petal_width='width', species='flower')"
            )
        cols = list(kwargs.keys())
        return df.filter(items=cols, axis="columns").rename(columns=kwargs)

    # Get col via name or exclude -name
    col_list = [*args]
    # Split columns to keep and drop based on '-' prefix
    drop, keep = filter("-", col_list, invert="split", assert_notempty=False)
    # Remove the prefix
    if len(drop):
        drop = mapcat(lambda col: col[1:], drop)
    if len(keep):
        return df.drop(columns=drop).filter(items=keep, axis="columns")
    return df.drop(columns=drop)