from __future__ import division
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from pymer4.utils import isPSD, nearestPSD
from pymer4.stats import discrete_inverse_logit
__all__ = ["easy_multivariate_normal", "simulate_lm", "simulate_lmm"]
__author__ = ["Eshin Jolly"]
__license__ = "MIT"
[docs]def simulate_lm(
num_obs,
num_coef,
coef_vals=None,
corrs=None,
mus=0.0,
sigmas=1.0,
noise_params=(0, 1),
family="gaussian",
seed=None,
):
"""
Function to quickly simulate a regression model dataset, with continuous predictors.
Provided a number of observations, number of coefficients, and optionally correlations between predictors, means, and standard deviations of predictors, returns a pandas dataframe with simulated data that can be used to estimate a linear regression using Lm(). Using the family='binomial' argument can generate discrete dependent variable values for use with logistic regression.
Defaults to returning standard normal (mu = 0; sigma = 1) predictors with no explicit correlations.
Args:
num_obs (int): number of total observations, i.e. rows of data
num_coef (int): number of coefficients/regressors, i.e. columns of data
coef_vals (list,optional): "true" values of coefficients to generate data. If not provided will be randomly generated. Must include a coefficient for the intercept as well (i.e. mean of data)
corrs (ndarray,list,float): correlations between coefficients provided as 2d num_coef x num_coef, 1d flattend numpy array/list of length (num_features * (num_features-1)) / 2, or a float to be treated as the same correlation between all coefficients
mus (float/list/ndarray): means of columns of predictors
sigmas (float/list/ndarray): stds of columns of predictors
noise_params (tup, optional): mean and std of noise added to simulated data
family (str): distribution family for the dependent variable. Currently only 'gaussian' (continuous DV) or 'binomial' (discrete DV) are available.
seed (int): seed for reproducible random number generation
Returns:
Multiple:
- **data** (*pd.DataFrame*): dataframe organized as num_obs x num_coef
- **coefs** (*np.array*): ground-truth coefficient values
"""
if seed is not None:
np.random.seed(seed)
if coef_vals is not None:
if len(coef_vals) - num_coef == 0:
raise ValueError(
"Missing one coefficient value. Did you provide a value for the intercept term?"
)
else:
assert (
len(coef_vals) == num_coef + 1
), "Number of coefficient values should be num_coef + 1 (for intercept)"
b = coef_vals
else:
b = np.random.rand(num_coef + 1)
if isinstance(mus, list) or isinstance(mus, np.ndarray):
assert len(mus) == len(b) - 1, "mus must match number of num_coef"
if isinstance(sigmas, list) or isinstance(sigmas, np.ndarray):
assert len(sigmas) == len(b) - 1, "sigmas must match number of num_coef"
assert (
isinstance(noise_params, tuple) and len(noise_params) == 2
), "noise_params should be a tuple of (mean,std)"
# Generate random design matrix
if corrs is not None:
X = easy_multivariate_normal(num_obs, num_coef, corrs, mus, sigmas, seed)
else:
X = np.random.normal(mus, sigmas, size=(num_obs, num_coef))
# Add intercept
X = np.column_stack([np.ones((num_obs, 1)), X])
# Generate data
Y = np.dot(X, b) + np.random.normal(*noise_params, size=num_obs)
# Apply transform if not linear model
if family == "binomial":
Y = discrete_inverse_logit(Y)
dat = pd.DataFrame(
np.column_stack([Y, X[:, 1:]]),
columns=["DV"] + ["IV" + str(elem + 1) for elem in range(X.shape[1] - 1)],
)
return dat, b
[docs]def simulate_lmm(
num_obs,
num_coef,
num_grps,
coef_vals=None,
corrs=None,
grp_sigmas=0.25,
mus=0.0,
sigmas=1.0,
noise_params=(0, 1),
family="gaussian",
seed=None,
):
"""
Function to quickly simulate a multi-level regression model dataset, with continuous predictors.
Provided a number of observations, number of coefficients, number of groups/clusters,
and optionally correlations between predictors, means, and standard deviations of predictors,
returns a pandas dataframe with simulated data that can be used to estimate a multi-level model using Lmer(). Using the family='binomial' argument can generate discrete dependent variable values for use with logistic multi-level models.
Defaults to returning standard normal (mu = 0; sigma = 1) predictors with no explicit correlations and low variance between
groups (sigma = .25).
Args:
num_obs (int): number of observations per cluster/stratum/group
num_coef (int): number of coefficients/regressors, i.e. columns of data
num_grps (int): number of cluster/stratums/groups
coef_vals (list,optional): "true" values of coefficients to generate data. If not provided will be randomly generated. Must include a coefficient for the intercept as well (i.e. mean of data)
corrs (ndarray,list,float): correlations between coefficients provided as 2d num_coef x num_coef, 1d flattend numpy array/list of length (num_features * (num_features-1)) / 2, or a float to be treated
as the same correlation between all coefficients
grp_sigmas (int or list): grp level std around population coefficient values; can be a single value in which case same std is applied around all coefficients or a list for different std; default .25
mus (float/list/ndarray): means of columns of predictors
sigmas (float/list/ndarray): stds of columns of predictors
noise_params (tup, optional): mean and std of noise added to each group's simulated data
family (str): distribution family for the dependent variable. Currently only 'gaussian' (continuous DV) or 'binomial' (discrete DV) are available.
seed (int): seed for reproducible random number generation
Returns:
Multiple:
- **data** (*pd.DataFrame*): dataframe organized as num_obs x num_coef
- **blups** (*pd.DataFrame*): ground-truth group/cluster level coefficients, organized as num_grps x num_coef (i.e. BLUPs)
- **coefs** (*np.array*): ground-truth population-level coefficients
"""
if seed is not None:
np.random.seed(seed)
if coef_vals:
if len(coef_vals) - num_coef == -1:
raise ValueError(
"Missing one coefficient value. Did you provide a value for the intercept term?"
)
else:
assert len(coef_vals) == num_coef + 1
b = coef_vals
else:
b = np.random.rand(num_coef + 1)
assert (
isinstance(noise_params, tuple) and len(noise_params) == 2
), "noise_params should be a tuple of (mean,std)"
assert (
isinstance(grp_sigmas, int)
or isinstance(grp_sigmas, list)
or isinstance(grp_sigmas, float)
), "grp_sigmas should be scalar value or list"
if not isinstance(grp_sigmas, list):
grp_sigmas = [grp_sigmas] * (num_coef + 1)
else:
assert len(grp_sigmas) == len(
b
), "The length of a list of grp_sigmas must match the num_coef plus intercept!"
if isinstance(mus, list) or isinstance(mus, np.ndarray):
assert len(mus) == len(b) - 1, "mus must match number of num_coef"
if isinstance(sigmas, list) or isinstance(sigmas, np.ndarray):
assert len(sigmas) == len(b) - 1, "sigmas must match number of num_coef"
# Generate group paramaters (BLUPs)
blups = np.array(
[np.random.normal(est, sigma, num_grps) for est, sigma in zip(b, grp_sigmas)]
).T
# Generate data
for grp in range(blups.shape[0]):
# Create a random design matrix per group
if corrs:
x = easy_multivariate_normal(num_obs, num_coef, corrs, mus, sigmas, seed)
else:
x = np.random.normal(mus, sigmas, size=(num_obs, num_coef))
x = np.column_stack([np.ones((num_obs, 1)), x])
# Use blups to generate group data
y = np.dot(x, blups[grp, :]) + np.random.normal(*noise_params, size=num_obs)
if family == "binomial":
y = discrete_inverse_logit(y)
if grp == 0:
x_all, y_all = x, y
else:
y_all = np.append(y_all, y, axis=0)
x_all = np.append(x_all, x, axis=0)
grp_ids = np.array([[elem] * num_obs for elem in range(1, num_grps + 1)]).ravel()
data = pd.DataFrame(
np.column_stack([y_all, x_all[:, 1:], grp_ids]),
columns=["DV"]
+ ["IV" + str(elem + 1) for elem in range(x_all.shape[1] - 1)]
+ ["Group"],
)
blups = pd.DataFrame(
blups,
columns=["Intercept"]
+ ["IV" + str(elem + 1) for elem in range(x_all.shape[1] - 1)],
index=["Grp" + str(elem + 1) for elem in range(num_grps)],
)
return data, blups, b
[docs]def easy_multivariate_normal(
num_obs,
num_features,
corrs,
mu=0.0,
sigma=1.0,
seed=None,
forcePSD=True,
return_new_corrs=False,
nit=100,
):
"""
Function to more easily generate multivariate normal samples provided a correlation matrix or list of correlations (upper triangle of correlation matrix) instead of a covariance matrix. Defaults to returning approximately standard normal (mu = 0; sigma = 1) variates. Unlike numpy, if the desired correlation matrix is not positive-semi-definite, will by default issue a warning and find the nearest PSD correlation matrix and generate data with this matrix. This new matrix can optionally be returned used the return_new_corrs argument.
Args:
num_obs (int): number of observations/samples to generate (rows)
corrs (ndarray/list/float): num_features x num_features 2d array, flattend numpy array of length (num_features * (num_features-1)) / 2, or scalar for same correlation on all off-diagonals
num_features (int): number of features/variables/dimensions to generate (columns)
mu (float/list): mean of each feature across observations; default 0.0
sigma (float/list): sd of each feature across observations; default 1.0
forcePD (bool): whether to find and use a new correlation matrix if the requested one is not positive semi-definite; default False
return_new_corrs (bool): return the nearest correlation matrix that is positive semi-definite used to generate data; default False
nit (int): number of iterations to search for the nearest positive-semi-definite correlation matrix is the requested correlation matrix is not PSD; default 100
Returns:
np.ndarray: 2d numpy array of correlated data organized as num_obs x num_features
"""
if seed is not None:
np.random.seed(seed)
if isinstance(mu, list):
assert len(mu) == num_features, "Number of means must match number of features"
else:
mu = [mu] * num_features
if isinstance(sigma, list):
assert len(sigma) == num_features, "Number of sds must match number of features"
else:
sigma = [sigma] * num_features
if isinstance(corrs, np.ndarray) and corrs.ndim == 2:
assert (
corrs.shape[0] == corrs.shape[1]
and np.allclose(corrs, corrs.T)
and np.allclose(np.diagonal(corrs), np.ones_like(np.diagonal(corrs)))
), "Correlation matrix must be square symmetric"
elif (isinstance(corrs, np.ndarray) and corrs.ndim == 1) or isinstance(corrs, list):
assert (
len(corrs) == (num_features * (num_features - 1)) / 2
), "(num_features * (num_features - 1) / 2) correlation values are required for a flattened array or list"
corrs = squareform(corrs)
np.fill_diagonal(corrs, 1.0)
elif isinstance(corrs, float):
corrs = np.array([corrs] * int(((num_features * (num_features - 1)) / 2)))
corrs = squareform(corrs)
np.fill_diagonal(corrs, 1.0)
else:
raise ValueError(
"Correlations must be num_features x num_feature, flattend numpy array/list or scalar"
)
if not isPSD(corrs):
if forcePSD:
# Tell user their correlations are being recomputed if they didnt ask to save them as they might not realize
if not return_new_corrs:
print(
"Correlation matrix is not positive semi-definite. Solved for new correlation matrix."
)
_corrs = np.array(nearestPSD(corrs, nit))
else:
raise ValueError(
"Correlation matrix is not positive semi-definite. Pymer4 will not generate inaccurate multivariate data. Use the forcePD argument to automatically solve for the closest desired correlation matrix."
)
else:
_corrs = corrs
# Rescale correlation matrix by variances, given standard deviations of features
sd = np.diag(sigma)
# R * Vars = R * SD * SD
cov = _corrs.dot(sd.dot(sd))
X = np.random.multivariate_normal(mu, cov, size=num_obs)
if return_new_corrs:
return X, _corrs
else:
return X