# Source code for pymer4.simulate

from __future__ import division
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from pymer4.utils import isPSD, nearestPSD
from pymer4.stats import discrete_inverse_logit

# Names exported when this module is star-imported; these are the three
# public simulation helpers defined below.
__all__ = ["easy_multivariate_normal", "simulate_lm", "simulate_lmm"]

# Module metadata.
__author__ = ["Eshin Jolly"]
__license__ = "MIT"


def simulate_lm(
    num_obs,
    num_coef,
    coef_vals=None,
    corrs=None,
    mus=0.0,
    sigmas=1.0,
    noise_params=(0, 1),
    family="gaussian",
    seed=None,
):
    """
    Function to quickly simulate a regression model dataset, with continuous predictors.
    Provided a number of observations, number of coefficients, and optionally correlations between predictors, means, and standard deviations of predictors, returns a pandas dataframe with simulated data that can be used to estimate a linear regression using Lm(). Using the family='binomial' argument can generate discrete dependent variable values for use with logistic regression.

    Defaults to returning standard normal (mu = 0; sigma = 1) predictors with no explicit correlations.

    Args:
        num_obs (int): number of total observations, i.e. rows of data
        num_coef (int): number of coefficients/regressors, i.e. predictor columns of data (excluding the intercept)
        coef_vals (list,optional): "true" values of coefficients to generate data. If not provided they are drawn with np.random.rand. Must include a coefficient for the intercept as the first element (i.e. num_coef + 1 values)
        corrs (ndarray,list,float): correlations between predictors provided as 2d num_coef x num_coef, 1d flattened numpy array/list of length (num_coef * (num_coef-1)) / 2, or a float to be treated as the same correlation between all predictors. If None, predictors are drawn independently.
        mus (float/list/ndarray): means of columns of predictors
        sigmas (float/list/ndarray): stds of columns of predictors
        noise_params (tup, optional): mean and std of noise added to simulated data
        family (str): distribution family for the dependent variable. 'binomial' passes the simulated values through discrete_inverse_logit (discrete DV); any other value leaves the DV continuous ('gaussian').
        seed (int): seed for reproducible random number generation (seeds numpy's global random state)

    Returns:
        Multiple:

            - **data** (*pd.DataFrame*): dataframe organized as num_obs x (num_coef + 1), with columns 'DV', 'IV1', ..., 'IV<num_coef>'

            - **coefs** (*np.array*): ground-truth coefficient values (intercept first); this is coef_vals itself when it was provided

    Raises:
        ValueError: if coef_vals has exactly num_coef entries (intercept missing)
        AssertionError: if coef_vals, mus, sigmas or noise_params have the wrong length/type

    """

    if seed is not None:
        np.random.seed(seed)

    if coef_vals is not None:
        # Exactly num_coef values almost always means the intercept was forgotten,
        # so give a targeted error for that case.
        if len(coef_vals) == num_coef:
            raise ValueError(
                "Missing one coefficient value. Did you provide a value for the intercept term?"
            )
        assert (
            len(coef_vals) == num_coef + 1
        ), "Number of coefficient values should be num_coef + 1 (for intercept)"
        b = coef_vals
    else:
        b = np.random.rand(num_coef + 1)

    if isinstance(mus, (list, np.ndarray)):
        assert len(mus) == len(b) - 1, "mus must match number of num_coef"
    if isinstance(sigmas, (list, np.ndarray)):
        assert len(sigmas) == len(b) - 1, "sigmas must match number of num_coef"
    assert (
        isinstance(noise_params, tuple) and len(noise_params) == 2
    ), "noise_params should be a tuple of (mean,std)"

    # Generate random design matrix
    if corrs is not None:
        X = easy_multivariate_normal(num_obs, num_coef, corrs, mus, sigmas, seed)
    else:
        X = np.random.normal(mus, sigmas, size=(num_obs, num_coef))
    # Add intercept column of ones so that b[0] acts as the intercept
    X = np.column_stack([np.ones((num_obs, 1)), X])
    # Generate data: linear predictor plus noise
    Y = np.dot(X, b) + np.random.normal(*noise_params, size=num_obs)
    # Apply transform if not linear model
    if family == "binomial":
        Y = discrete_inverse_logit(Y)

    # The intercept column is dropped from the returned frame; only DV + predictors remain
    predictor_names = ["IV" + str(idx + 1) for idx in range(X.shape[1] - 1)]
    dat = pd.DataFrame(
        np.column_stack([Y, X[:, 1:]]),
        columns=["DV"] + predictor_names,
    )

    return dat, b


def simulate_lmm(
    num_obs,
    num_coef,
    num_grps,
    coef_vals=None,
    corrs=None,
    grp_sigmas=0.25,
    mus=0.0,
    sigmas=1.0,
    noise_params=(0, 1),
    family="gaussian",
    seed=None,
):
    """
    Function to quickly simulate a multi-level regression model dataset, with continuous predictors.
    Provided a number of observations, number of coefficients, number of groups/clusters,
    and optionally correlations between predictors, means, and standard deviations of predictors,
    returns a pandas dataframe with simulated data that can be used to estimate a multi-level model using Lmer(). Using the family='binomial' argument can generate discrete dependent variable values for use with logistic multi-level models.

    Defaults to returning standard normal (mu = 0; sigma = 1) predictors with no explicit correlations and low variance between
    groups (sigma = .25).

    Args:
        num_obs (int): number of observations per cluster/stratum/group
        num_coef (int): number of coefficients/regressors, i.e. predictor columns of data (excluding the intercept)
        num_grps (int): number of cluster/stratums/groups
        coef_vals (list/ndarray,optional): "true" population-level coefficients used to generate data. If not provided they are drawn with np.random.rand. Must include a coefficient for the intercept as the first element (i.e. num_coef + 1 values)
        corrs (ndarray,list,float): correlations between predictors provided as 2d num_coef x num_coef, 1d flattened numpy array/list of length (num_coef * (num_coef-1)) / 2, or a float to be treated as the same correlation between all predictors. If None, predictors are drawn independently.
        grp_sigmas (int, float or list): grp level std around population coefficient values; can be a single value in which case same std is applied around all coefficients or a list (length num_coef + 1) for different std; default .25
        mus (float/list/ndarray): means of columns of predictors
        sigmas (float/list/ndarray): stds of columns of predictors
        noise_params (tup, optional): mean and std of noise added to each group's simulated data
        family (str): distribution family for the dependent variable. 'binomial' passes the simulated values through discrete_inverse_logit (discrete DV); any other value leaves the DV continuous ('gaussian').
        seed (int): seed for reproducible random number generation (seeds numpy's global random state once, before any draws)

    Returns:
        Multiple:

            - **data** (*pd.DataFrame*): dataframe organized as (num_obs * num_grps) x (num_coef + 2), with columns 'DV', 'IV1', ..., 'IV<num_coef>', 'Group'

            - **blups** (*pd.DataFrame*): ground-truth group/cluster level coefficients, organized as num_grps x (num_coef + 1) (i.e. BLUPs), with columns 'Intercept', 'IV1', ...

            - **coefs** (*np.array*): ground-truth population-level coefficients; this is coef_vals itself when it was provided

    Raises:
        ValueError: if coef_vals has exactly num_coef entries (intercept missing)
        AssertionError: if coef_vals, grp_sigmas, mus, sigmas or noise_params have the wrong length/type

    """

    if seed is not None:
        np.random.seed(seed)

    # Compare against None rather than relying on truthiness: an ndarray of
    # coefficients has no unambiguous truth value.
    if coef_vals is not None:
        # Exactly num_coef values almost always means the intercept was forgotten.
        if len(coef_vals) == num_coef:
            raise ValueError(
                "Missing one coefficient value. Did you provide a value for the intercept term?"
            )
        assert (
            len(coef_vals) == num_coef + 1
        ), "Number of coefficient values should be num_coef + 1 (for intercept)"
        b = coef_vals
    else:
        b = np.random.rand(num_coef + 1)

    assert (
        isinstance(noise_params, tuple) and len(noise_params) == 2
    ), "noise_params should be a tuple of (mean,std)"
    assert isinstance(
        grp_sigmas, (int, float, list)
    ), "grp_sigmas should be scalar value or list"
    if not isinstance(grp_sigmas, list):
        # Same between-group std for the intercept and every slope
        grp_sigmas = [grp_sigmas] * (num_coef + 1)
    else:
        assert len(grp_sigmas) == len(
            b
        ), "The length of a list of grp_sigmas must match the num_coef plus intercept!"

    if isinstance(mus, (list, np.ndarray)):
        assert len(mus) == len(b) - 1, "mus must match number of num_coef"
    if isinstance(sigmas, (list, np.ndarray)):
        assert len(sigmas) == len(b) - 1, "sigmas must match number of num_coef"

    # Generate group parameters (BLUPs): one row per group, one column per coefficient
    blups = np.array(
        [np.random.normal(est, sigma, num_grps) for est, sigma in zip(b, grp_sigmas)]
    ).T

    # Generate data group by group, collecting the pieces and stacking once at the end
    x_parts, y_parts = [], []
    for grp_coefs in blups:
        # Create a random design matrix per group
        if corrs is not None:
            # Do NOT forward `seed` here: the global state was already seeded
            # above, and re-seeding on every iteration would give every group
            # the same design matrix and the same noise.
            x = easy_multivariate_normal(num_obs, num_coef, corrs, mus, sigmas)
        else:
            x = np.random.normal(mus, sigmas, size=(num_obs, num_coef))
        x = np.column_stack([np.ones((num_obs, 1)), x])
        # Use this group's blups to generate its data
        y = np.dot(x, grp_coefs) + np.random.normal(*noise_params, size=num_obs)
        if family == "binomial":
            y = discrete_inverse_logit(y)
        x_parts.append(x)
        y_parts.append(y)

    x_all = np.concatenate(x_parts, axis=0)
    y_all = np.concatenate(y_parts, axis=0)

    # Group labels 1..num_grps, each repeated for that group's num_obs rows
    grp_ids = np.repeat(np.arange(1, num_grps + 1), num_obs)

    predictor_names = ["IV" + str(idx + 1) for idx in range(x_all.shape[1] - 1)]

    # The intercept column of ones is dropped from the returned data
    data = pd.DataFrame(
        np.column_stack([y_all, x_all[:, 1:], grp_ids]),
        columns=["DV"] + predictor_names + ["Group"],
    )
    blups = pd.DataFrame(
        blups,
        columns=["Intercept"] + predictor_names,
        index=["Grp" + str(idx + 1) for idx in range(num_grps)],
    )
    return data, blups, b


def easy_multivariate_normal(
    num_obs,
    num_features,
    corrs,
    mu=0.0,
    sigma=1.0,
    seed=None,
    forcePSD=True,
    return_new_corrs=False,
    nit=100,
):
    """
    Function to more easily generate multivariate normal samples provided a correlation matrix or list of correlations (upper triangle of correlation matrix) instead of a covariance matrix. Defaults to returning approximately standard normal (mu = 0; sigma = 1) variates. Unlike numpy, if the desired correlation matrix is not positive-semi-definite, will by default print a message and find the nearest PSD correlation matrix and generate data with this matrix. This new matrix can optionally be returned using the return_new_corrs argument.

    Args:
        num_obs (int): number of observations/samples to generate (rows)
        num_features (int): number of features/variables/dimensions to generate (columns)
        corrs (ndarray/list/float): num_features x num_features 2d array, flattened numpy array/list of length (num_features * (num_features-1)) / 2, or scalar for same correlation on all off-diagonals
        mu (float/list/tuple/ndarray): mean of each feature across observations; default 0.0
        sigma (float/list/tuple/ndarray): sd of each feature across observations; default 1.0
        seed (int): if not None, seeds numpy's global random state before sampling
        forcePSD (bool): whether to find and use a new correlation matrix if the requested one is not positive semi-definite; default True
        return_new_corrs (bool): also return the correlation matrix actually used to generate data; default False
        nit (int): number of iterations to search for the nearest positive-semi-definite correlation matrix if the requested correlation matrix is not PSD; default 100

    Returns:
        np.ndarray: 2d numpy array of correlated data organized as num_obs x num_features. If return_new_corrs is True, a tuple of (data, correlation matrix used) is returned instead.

    Raises:
        ValueError: if corrs is not a 2d array, 1d array/list or scalar, or if it is not PSD and forcePSD is False
        AssertionError: if mu, sigma or corrs have the wrong length/shape
    """

    if seed is not None:
        np.random.seed(seed)

    # Per-feature values may arrive as any sequence type (the simulate_*
    # callers explicitly allow ndarrays); only true scalars are replicated.
    sequence_types = (list, tuple, np.ndarray)
    if isinstance(mu, sequence_types):
        assert len(mu) == num_features, "Number of means must match number of features"
    else:
        mu = [mu] * num_features
    if isinstance(sigma, sequence_types):
        assert len(sigma) == num_features, "Number of sds must match number of features"
    else:
        sigma = [sigma] * num_features

    # Number of unique off-diagonal correlations
    num_pairs = (num_features * (num_features - 1)) // 2

    if isinstance(corrs, np.ndarray) and corrs.ndim == 2:
        assert (
            corrs.shape[0] == corrs.shape[1]
            and np.allclose(corrs, corrs.T)
            and np.allclose(np.diagonal(corrs), np.ones_like(np.diagonal(corrs)))
        ), "Correlation matrix must be square symmetric"
    elif (isinstance(corrs, np.ndarray) and corrs.ndim == 1) or isinstance(corrs, list):
        assert (
            len(corrs) == num_pairs
        ), "(num_features * (num_features - 1) / 2) correlation values are required for a flattened array or list"
        # Expand the condensed upper triangle into a full symmetric matrix
        corrs = squareform(corrs)
        np.fill_diagonal(corrs, 1.0)
    elif isinstance(corrs, (int, float)):
        # Same correlation for every pair of features
        corrs = squareform(np.array([float(corrs)] * num_pairs))
        np.fill_diagonal(corrs, 1.0)
    else:
        raise ValueError(
            "Correlations must be num_features x num_feature, flattend numpy array/list or scalar"
        )

    if not isPSD(corrs):
        if forcePSD:
            # Tell user their correlations are being recomputed if they didnt ask to save them as they might not realize
            if not return_new_corrs:
                print(
                    "Correlation matrix is not positive semi-definite. Solved for new correlation matrix."
                )
            _corrs = np.array(nearestPSD(corrs, nit))

        else:
            raise ValueError(
                "Correlation matrix is not positive semi-definite. Pymer4 will not generate inaccurate multivariate data. Use the forcePSD argument to automatically solve for the closest desired correlation matrix."
            )
    else:
        _corrs = corrs

    # Rescale correlation matrix to a covariance matrix: Cov = SD * R * SD,
    # i.e. cov[i, j] = r[i, j] * sigma[i] * sigma[j]. (R * SD * SD would scale
    # only the columns and is not symmetric when the sds differ.)
    sd = np.diag(sigma)
    cov = sd.dot(_corrs).dot(sd)
    X = np.random.multivariate_normal(mu, cov, size=num_obs)

    if return_new_corrs:
        return X, _corrs
    else:
        return X