Skip to content

IO (input-output functions)

utilz.io

I/O Module

crawl(where='.', ignore_git=True, respect_gitignore=True, ignore=None)

Crawls a folder and returns a list of Path objects containing folders and files, while respecting a .gitignore file if present and any additional ignore file names or patterns.

Parameters:

Name Type Description Default
where Union[str, Path]

location to glob in. Defaults to ".".

'.'
ignore_git bool

ignore .git folder. Defaults to True.

True
respect_gitignore bool

read and ignore all files and patterns in the .gitignore file. Defaults to True.

True
ignore Union[None, list, str]

additional files or patterns to ignore. Defaults to None.

None

Returns:

Name Type Description
List List

the Path objects (folders and files) found under where that were not ignored

Source code in utilz/io.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def crawl(
    where: Union[str, Path] = ".",
    ignore_git: bool = True,
    respect_gitignore: bool = True,
    ignore: Union[None, list, str] = None,
) -> List:
    """
    Crawls a folder and returns a list of Path objects containing folders and files
    while respecting a gitignore file if present and any additional ignore file names or patterns

    Args:
        where (Union[str, Path], optional): location to glob in. Defaults to ".".
        ignore_git (bool, optional): ignore `.git` folder. Defaults to True.
        respect_gitignore (bool, optional): read and ignore all files and patterns in
        `.gitignore` file; silently skipped when no such file exists. Defaults to True.
        ignore (Union[None, list, str], optional): additional files or patterns to ignore. Defaults to None.

    Returns:
        List: the Path objects found under `where` that survived the ignore
        filters, after being passed through `sort`
    """

    if ignore is None:
        ignore = []
    elif isinstance(ignore, str):
        # A single name/pattern is treated as a one-element list
        ignore = [ignore]
    ignore_list = [".git"] if ignore_git else []

    # Only read the gitignore when it actually exists; previously a missing file
    # made `load` raise even though gitignore support is documented as "if present".
    # NOTE(review): the file is looked up relative to the current working
    # directory, not `where` -- confirm this is intended.
    if respect_gitignore and Path(".gitignore").is_file():
        ignore_list += mapcat(lambda s: s.strip("\n"), load(".gitignore"))

    ignore_list += ignore
    # Split glob patterns and regular file/folder name matches
    globs, nonglobs = filter("*", ignore_list, invert="split")

    # Generator for all files
    out = Path(where)
    files = out.rglob("*")

    # Filter out folders and file names from gitignore
    # Filter out glob patterns from gitignore
    return pipe(
        files,
        filter(nonglobs, invert=True),
        filter(lambda f: any(fnmatchcase(str(f), g) for g in globs), invert=True),
        sort,
    )

load(f, as_arr=False, as_str=False, verbose=False, glob='*', glob_sort=True, assert_notempty=True, loader_func=None, **kwargs)

A handy dandy all-in-one loading function. Simply pass a Path object to a file or directory and you'll get back a python object or list of objects based on the file-extension:

  • .csv: pd.Dataframe
  • .p/.pickle: output of pickle.load
  • .json: str or dict
  • .npy: np.ndarray
  • .txt: np.ndarray, list[str] (lines of a file), or str (all file contents)
  • other file-extensions are attempted to be loaded like .txt files
  • if given a directory, all files matching glob in that directory will be loaded

Parameters:

Name Type Description Default
f Path/str

name or path object to load

required
as_arr bool

treat a .txt file as a numpy array; Default False

False
as_str bool

open txt/json as a single string instead of splitting on newlines; Default False

False
assert_notempty bool

make sure the output is not Falsey (e.g. empty array, dataframe, string, list); Default True

True
verbose bool

whether to print messages during load. Default False

False
**kwargs

keyword arguments to pd.read_csv or np.loadtxt

{}
glob string

globbing pattern if f is a directory. Defaults to all files

'*'
glob_sort bool

sort the glob results before loading. Defaults to True

True
assert_notempty bool

raise an error if the returned output is empty; Default True

True
loader_func callable

a custom function to use for loading; Default None, uses file extension

None

Returns:

Type Description
Any

the loaded object or list of objects

Source code in utilz/io.py
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
def load(
    f: Union[Path, str],
    as_arr: bool = False,
    as_str: bool = False,
    verbose: bool = False,
    glob: str = "*",
    glob_sort: bool = True,
    assert_notempty: bool = True,
    loader_func: Union[Callable, None] = None,
    **kwargs,
) -> Any:
    """
    A handy dandy all-in-one loading function. Simply pass a Path object to a file or directory and you'll get back a python object or list of objects based on the file-extension:

    - `.csv`: `pd.Dataframe`
    - `.p/.pickle`: output of `pickle.load`
    - `.json`: `str` (raw file contents, when `as_str=True`) or the parsed JSON value
    - `.npy`: `np.ndarray`
    - `.txt`: `np.ndarray`, `list[str]` (lines of a file), or `str` (all file contents)
    - other file-extensions are attempted to be loaded like `.txt` files
    - if given a directory, all files matching `glob` in that directory will be loaded

    Args:
        f (Path/str): name or path object to load
        as_arr (bool, optional): treat a .txt file as a numpy array;
        Default False
        as_str (bool, optional): open txt/json as a single string instead of
        splitting on newlines (txt) or parsing (json); Default False
        verbose (bool, optional): whether to print messages during load. Default False
        glob (string, optional): globbing pattern if f is a directory. Defaults to all files
        glob_sort (bool, optional): sort the glob results before loading. Defaults to True
        assert_notempty (bool, optional): raise an error if the returned output has
        length zero (e.g. empty array, dataframe, string, list); outputs that have no
        length at all are never considered empty. Default True
        loader_func (callable, optional): a custom function to use for loading; it is
        called with the path as a string. Default None, uses file extension
        **kwargs: keyword arguments to `np.load`, `pd.read_csv` or `np.loadtxt`

    Returns:
        the loaded object or list of objects

    Raises:
        TypeError: if `f` is neither a string nor a Path
        AssertionError: if `assert_notempty` is True and the loaded output is empty
    """

    if isinstance(f, str):
        f = Path(f)
    if not isinstance(f, Path):
        raise TypeError("Input must be a string or Path object")

    if f.is_dir():
        matches = f.glob(glob)
        paths = sorted(matches) if glob_sort else list(matches)
        # Recursively call load on each match and forward args. `glob` is
        # deliberately NOT forwarded: a matched sub-directory is loaded in full,
        # as before. `glob_sort` is forwarded so nested directories honor it too.
        out = [
            load(
                o,
                as_arr=as_arr,
                as_str=as_str,
                verbose=verbose,
                glob_sort=glob_sort,
                loader_func=loader_func,
                assert_notempty=assert_notempty,
                **kwargs,
            )
            for o in paths
        ]

    elif loader_func is not None:
        if verbose:
            print("Using provided custom load function")
        out = loader_func(str(f))

    elif f.suffix == ".npy":
        if verbose:
            print("npy file - using numpy")
        out = np.load(str(f), **kwargs)

    elif f.suffix == ".csv":
        if verbose:
            print("csv file - using pandas")
        out = pd.read_csv(str(f), **kwargs)

    elif f.suffix in (".p", ".pickle"):
        if verbose:
            print("pickle file - using pickle")
        # SECURITY: unpickling can execute arbitrary code; only load trusted files
        with f.open(mode="rb") as file_handle:
            out = pickle.load(file_handle)

    elif f.suffix == ".json":
        if verbose:
            print("json file - using json")
        with f.open() as file_handle:
            if as_str:
                # Return the raw text; parsing here would make as_str a no-op
                out = file_handle.read()
            else:
                out = json.load(file_handle)

    else:
        if verbose and f.suffix != ".txt":
            warn(f"{f.suffix} not supported treating as .txt file...")

        if as_arr:
            if verbose:
                print("txt file - using numpy")
            out = np.loadtxt(str(f), **kwargs)
        else:
            if verbose:
                print("txt file using - open")
            with f.open() as file_handle:
                if as_str:
                    out = file_handle.read()
                else:
                    out = file_handle.readlines()

    if assert_notempty:
        try:
            is_empty = len(out) == 0
        except TypeError:
            # Unsized results (scalars, 0-d arrays, None, ...) have no length,
            # so they cannot be "empty"; don't crash the load because of them.
            is_empty = False
        if is_empty:
            raise AssertionError("Loaded data is empty!")

    return out