Source code for pymer4.models.Lmer

"""
Pymer4 Lmer Class
=================

Main class to wrap R's lme4 library
"""


from copy import copy
from rpy2.robjects.packages import importr
import rpy2.robjects as robjects
from rpy2.rinterface_lib import callbacks
import rpy2.rinterface as rinterface
from rpy2.robjects.conversion import localconverter
from rpy2.robjects import numpy2ri
import warnings
import traceback
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ..utils import (
    _sig_stars,
    _perm_find,
    _return_t,
    _to_ranks_by_group,
)
from ..bridge import con2R, pandas2R, R2pandas, R2numpy
from pandas.api.types import CategoricalDtype

# Import R libraries we need
base = importr("base")
stats = importr("stats")

# Make a reference to the default R console writer from rpy2
consolewrite_warning_backup = callbacks.consolewrite_warnerror
consolewrite_print_backup = callbacks.consolewrite_print


[docs]class Lmer(object): """ Model class to hold data outputted from fitting lmer in R and converting to Python object. This class stores as much information as it can about a merMod object computed using lmer and lmerTest in R. Most attributes will not be computed until the fit method is called. Args: formula (str): Complete lmer-style model formula data (pandas.core.frame.DataFrame): input data family (string): what distribution family (i.e.) link function to use for the generalized model; default is gaussian (linear model) Attributes: fitted (bool): whether model has been fit formula (str): model formula data (pd.DataFrame): model copy of input data grps (dict): groups and number of observations per groups recognized by lmer design_matrix (pd.DataFrame): model design matrix determined by lmer AIC (float): model Akaike information criterion BIC (float): model Bayesian information criterion logLike (float): model Log-likelihood family (string): model family warnings (list): warnings output from R or Python ranef (pd.DataFrame/list): cluster-level differences from population parameters, i.e. difference between coefs and fixefs; returns list if multiple cluster variables are used to specify random effects (e.g. subjects and items) fixef (pd.DataFrame/list): cluster-level parameters; returns list if multiple cluster variables are used to specify random effects (e.g. subjects and items) coefs (pandas.core.frame.DataFrame/list): model summary table of population parameters ranef_var (pd.DataFrame): random effects variances ranef_corr(pd.DataFrame): random effects correlations residuals (numpy.ndarray): model residuals fits (numpy.ndarray): model fits/predictions model_obj (lmer model): rpy2 lmer model object factors (dict): factors used to fit the model if any """ def __init__(self, formula, data, family="gaussian"): self.family = family implemented_fams = [ "gaussian", "binomial", "gamma", "inverse_gaussian", "poisson", ] if self.family not in implemented_fams: raise ValueError( "Family must be one of: gaussian, binomial, gamma, inverse_gaussian or poisson!" ) self.fitted = False self.formula = formula.replace(" ", "") self.data = copy(data) self.grps = None self.AIC = None self.BIC = None self.logLike = None self.warnings = [] self.ranef_var = None self.ranef_corr = None self.ranef = None self.fixef = None self.design_matrix = None self.residuals = None self.coefs = None self.model_obj = None self.factors = None self.contrast_codes = None self.ranked_data = False self.marginal_estimates = None self.marginal_contrasts = None self.sig_type = None self.factors_prev_ = None self.contrasts = None def __repr__(self): out = "{}(fitted = {}, formula = {}, family = {})".format( self.__class__.__module__, self.fitted, self.formula, self.family ) return out def _make_factors(self, factor_dict, ordered=False): """ Covert specific columns to R-style factors. Default scheme is dummy coding where reference is 1st level provided. Alternative is orthogonal polynomial contrasts. User can also specific custom contrasts. Args: factor_dict: (dict) dictionary with column names specified as keys and values as a list for dummy/treatment/polynomial contrast; a dict with keys as factor leves and values as desired comparisons in human readable format ordered: (bool) whether to interpret factor_dict values as dummy-coded (1st list item is reference level) or as polynomial contrasts (linear contrast specified by ordered of list items); ignored if factor_dict values are not a list Returns: pandas.core.frame.DataFrame: copy of original data with factorized columns Examples: Dummy/treatment contrasts with 'A' as the reference level and other contrasts as 'B'-'A' and 'C'-'A' >>> _make_factors(factor_dict={'factor': ['A','B','C']}) Same as above but a linear contrast (and automatically computed quadratic contrast) of A < B < C >>> _make_factors(factor_dict={'factor': ['A','B','C']}, ordered=True) Custom contrast of 'A' - mean('B', 'C') >>> _make_factors(factor_dict={'factor': {'A': 1, 'B': -0.5, 'C': -0.5}}) """ errormsg = "factors should be specified as a dictionary with values as one of:\n1) a list with factor levels in the desired order for dummy/treatment/polynomial contrasts\n2) a dict with keys as factor levels and values as desired comparisons in human readable format" # We create a copy of data because we need to convert dtypes to categories and then pass them to R. However, resetting categories on the *same* dataframe and passing to R repeatedly (e.g. multiple calls to .fit with different contrasats) does not work as R only uses the 1st category spec. So instead we create a copy and return that copy to get used by .fit out = {} df = self.data.copy() if not isinstance(factor_dict, dict): raise TypeError(errormsg) for factor, contrasts in factor_dict.items(): # First convert to a string type because R needs string based categories df[factor] = df[factor].apply(str) # Treatment/poly contrasts if isinstance(contrasts, list): # Ensure that all factor levels are accounted for if not all([e in contrasts for e in df[factor].unique()]): raise ValueError( "Not all factor levels are specified in the desired contrast" ) # Define and apply a pandas categorical type in the same order as requested, which will get converted to the right factor levels in R cat = CategoricalDtype(contrasts) df[factor] = df[factor].astype(cat) if ordered: # Polynomial contrasts con_codes = np.array(stats.contr_poly(len(contrasts))) else: # Treatment/dummy contrasts con_codes = np.array(stats.contr_treatment(len(contrasts))) out[factor] = con_codes # Custom contrasts (human readable) elif isinstance(contrasts, dict): factor_levels = list(contrasts.keys()) cons = list(contrasts.values()) # Ensure that all factor levels are accounted for if not all([e in factor_levels for e in df[factor].unique()]): raise ValueError( "Not all factor levels are specified in the desired contrast" ) # Define and apply categorical type in the same order as requested cat = CategoricalDtype(factor_levels) df[factor] = df[factor].astype(cat) # Compute desired contrasts in R format along with addition k - 1 contrasts not specified con_codes = con2R(cons) out[factor] = con_codes else: raise TypeError(errormsg) self.factors = factor_dict self.contrast_codes = out # have to do local convertion because dict values are numpy arrays :( with localconverter(robjects.default_converter + numpy2ri.converter): out = robjects.ListVector(out) return out, df def _refit_orthogonal(self): """ Refit a model with factors organized as polynomial contrasts to ensure valid type-3 SS calculations with using `.anova()`. Previous factor specifications are stored in `model.factors_prev_`. """ self.factors_prev_ = copy(self.factors) self.contrast_codes_prev_ = copy(self.contrast_codes) # Create orthogonal polynomial contrasts for all factors, by creating a list of unique # factor levels as self._make_factors will handle the rest new_factors = {} for factor in self.factors.keys(): new_factors[factor] = sorted(list(map(str, self.data[factor].unique()))) self.fit( factors=new_factors, ordered=True, summarize=False, permute=self._permute, conf_int=self._conf_int, REML=self._REML, )
[docs] def anova(self, force_orthogonal=False): """ Return a type-3 ANOVA table from a fitted model. Like R, this method does not ensure that contrasts are orthogonal to ensure correct type-3 SS computation. However, the force_orthogonal flag can refit the regression model with orthogonal polynomial contrasts automatically guaranteeing valid SS type 3 inferences. Note that this will overwrite factors specified in the last call to `.fit()` Args: force_orthogonal (bool): whether factors in the model should be recoded using polynomial contrasts to ensure valid type-3 SS calculations. If set to True, previous factor specifications will be saved in `model.factors_prev_`; default False Returns: pd.DataFrame: Type 3 ANOVA results """ if self.factors: # Model can only have factors if it's been fit if force_orthogonal: self._refit_orthogonal() elif not self.fitted: raise ValueError("Model must be fit before ANOVA table can be generated!") rstring = """ function(model){ df<- anova(model) df } """ anova = robjects.r(rstring) self.anova_results = R2pandas(anova(self.model_obj)) if self.anova_results.shape[1] == 6: self.anova_results.columns = [ "SS", "MS", "NumDF", "DenomDF", "F-stat", "P-val", ] self.anova_results["Sig"] = self.anova_results["P-val"].apply( lambda x: _sig_stars(x) ) elif self.anova_results.shape[1] == 4: warnings.warn( "MODELING FIT WARNING! Check model.warnings!! P-value computation did not occur because lmerTest choked. Possible issue(s): ranefx have too many parameters or too little variance..." ) self.anova_results.columns = ["DF", "SS", "MS", "F-stat"] if force_orthogonal: print( "SS Type III Analysis of Variance Table with Satterthwaite approximated degrees of freedom:\n(NOTE: Model refit with orthogonal polynomial contrasts)" ) else: print( "SS Type III Analysis of Variance Table with Satterthwaite approximated degrees of freedom:\n(NOTE: Using original model contrasts, orthogonality not guaranteed)" ) return self.anova_results
def _get_ngrps(self, unsum, base): """Get the groups information from the model as a dictionary""" # This works for 2 grouping factors ns = unsum.rx2("ngrps") names = base.names(self.model_obj.slots["flist"]) self.grps = dict(zip(names, ns)) def _set_R_stdout(self, verbose): """Adjust whether R prints to the console (often as a duplicate) based on the verbose flag of a method call. Reference to rpy2 interface here: https://bit.ly/2MsrufO""" if verbose: # use the default logging in R callbacks.consolewrite_warnerror = consolewrite_warning_backup else: # Create a list buffer to catch messages and discard them buf = [] def _f(x): buf.append(x) callbacks.consolewrite_warnerror = _f
[docs] def fit( self, conf_int="Wald", n_boot=500, factors=None, permute=False, ordered=False, verbose=False, REML=True, rank=False, rank_group="", rank_exclude_cols=[], no_warnings=False, control="", old_optimizer=False, **kwargs, ): """ Main method for fitting model object. Will modify the model's data attribute to add columns for residuals and fits for convenience. Factors should be specified as a dictionary with values as a list or themselves a dictionary of *human readable* contrasts *not* R-style contrast codes as these will be auto-converted for you. See the factors docstring and examples below. After fitting, the .factors attribute will store a reference to the user-specified dictionary. The .contrast_codes model attributes will store the requested comparisons in converted R format. Note that Lmer estimate naming conventions differs a bit from R: Lmer.coefs = summary(model); Lmer.fixefs = coefs(model); Lmer.ranef = ranef(model) Args: conf_int (str): which method to compute confidence intervals; 'profile', 'Wald' (default), or 'boot' (parametric bootstrap) n_boot (int): number of bootstrap intervals if bootstrapped confidence intervals are requests; default 500 factors (dict): dictionary with column names specified as keys and values as a list for dummy/treatment/polynomial contrast or a dict with keys as factor leves and values as desired comparisons in human readable format See examples below permute (int): if non-zero, computes parameter significance tests by permuting test stastics rather than parametrically. Permutation is done by shuffling observations within clusters to respect random effects structure of data. ordered (bool): whether factors should be treated as ordered polynomial contrasts; this will parameterize a model with K-1 orthogonal polynomial regressors beginning with a linear contrast based on the factor order provided. Ordering applies to **all** contrasts!; default is False summarize/summary (bool): whether to print a model summary after fitting; default is True verbose (bool): whether to print when and which model and confidence interval are being fitted REML (bool): whether to fit using restricted maximum likelihood estimation instead of maximum likelihood estimation; default True rank (bool): covert predictors in model formula to ranks by group prior to estimation. Model object will still contain original data not ranked data; default False rank_group (str): column name to group data on prior to rank conversion rank_exclude_cols (list/str): columns in model formula to not apply rank conversion to no_warnings (bool): turn off auto-printing warnings messages; warnings are always stored in the .warnings attribute; default False control (str): string containing options to be passed to (g)lmer control. See https://bit.ly/2OQONTH for options old_optimizer (bool): use the old bobyqa optimizer that was the default in lmer4 <= 1.1_20, i.e. prior to 02/04/2019. This is not compatible with the control setting as it's meant to be a quick shorthand (e.g. to reproduce previous model results). However, the same setting can be manually requested using the control option if preferred. (For optimizer change discussions see: https://bit.ly/2MrP9Nq and https://bit.ly/2Vx5jte ) Returns: pd.DataFrame: R/statsmodels style summary Examples: The following examples demonstrate how to treat variables as categorical factors. Dummy-Coding: Treat Col1 as a factor which 3 levels: A, B, C. Use dummy-coding with A as the reference level. Model intercept will be mean of A, and parameters will be B-A, and C-A. >>> model.fit(factors = {"Col1": ['A','B','C']}) Orthogonal Polynomials: Treat Col1 as a factor which 3 levels: A, B, C. Estimate a linear contrast of C > B > A. Model intercept will be grand-mean of all levels, and parameters will be linear contrast, and orthogonal polynomial contrast (auto-computed). >>> model.fit(factors = {"Col1": ['A','B','C']}, ordered=True) Custom-contrast: Treat Col1 as a factor which 3 levels: A, B, C. Compare A to the mean of B and C. Model intercept will be the grand-mean of all levels, and parameters will be the desired contrast, a well as an automatically determined orthogonal contrast. >>> model.fit(factors = {"Col1": {'A': 1, 'B': -.5, 'C': -.5}})) Here is an example specifying stricter deviance and paramter values stopping criteria. >>> model.fit(control="optCtrl = list(ftol_abs=1e-8, xtol_abs=1e-8)") Here is an example specifying a different optimizer in addition to stricter deviance and paramter values stopping criteria. >>> model.fit(control="optimizer='Nelder_Mead', optCtrl = list(FtolAbs=1e-8, XtolRel=1e-8)") Here is an example using the default optimization in previous versions of lme4 prior to the 2019 update. >>> model.fit(old_optimizer=True) """ # Alllow summary or summarize for compatibility if "summary" in kwargs and "summarize" in kwargs: raise ValueError( "You specified both summary and summarize, please prefer summarize" ) summarize = kwargs.pop("summarize", True) summarize = kwargs.pop("summary", summarize) # Save params for future calls self._permute = permute self._conf_int = conf_int self._REML = REML if self.family == "gaussian" else False self._set_R_stdout(verbose) if permute is True: raise TypeError( "permute should 'False' or the number of permutations to perform" ) if old_optimizer: if control: raise ValueError( "Must specify EITHER control OR old_optimizer not both" ) else: control = "optimizer='bobyqa'" if factors: contrasts, dat = self._make_factors(factors, ordered) else: contrasts = rinterface.NULL dat = self.data if rank: if not rank_group: raise ValueError("rank_group must be provided if rank is True") dat = _to_ranks_by_group( self.data, rank_group, self.formula, rank_exclude_cols ) if factors and (set(factors.keys()) != set(rank_exclude_cols)): w = "Factors and ranks requested, but factors are not excluded from rank conversion. Are you sure you wanted to do this?" warnings.warn(w) self.warnings.append(w) if conf_int == "boot": self.sig_type = "bootstrapped" else: if permute: self.sig_type = "permutation" + " (" + str(permute) + ")" else: self.sig_type = "parametric" data = pandas2R(dat) if self.family == "gaussian": _fam = "gaussian" if verbose: print( f"Fitting linear model using lmer with {conf_int} confidence intervals...\n" ) lmer = importr("lmerTest") lmc = robjects.r(f"lmerControl({control})") self.model_obj = lmer.lmer( self.formula, data=data, REML=REML, control=lmc, contrasts=contrasts ) else: if verbose: print( f"Fitting generalized linear model using glmer (family {self.family}) with {conf_int} confidence intervals...\n" ) lmer = importr("lme4") if self.family == "inverse_gaussian": _fam = "inverse.gaussian" elif self.family == "gamma": _fam = "Gamma" else: _fam = self.family lmc = robjects.r(f"glmerControl({control})") self.model_obj = lmer.glmer( self.formula, data=data, family=_fam, control=lmc, contrasts=contrasts, ) # Store design matrix and get number of IVs for inference design_matrix = stats.model_matrix(self.model_obj) # rpy2 > 3.4 returns a numpy array that can be empty but has shape (obs x IVs) if isinstance(design_matrix, np.ndarray): if design_matrix.shape[1] > 0: self.design_matrix = pd.DataFrame(design_matrix) num_IV = self.design_matrix.shape[1] else: num_IV = 0 # rpy2 < 3.4 returns an R matrix object with a length elif len(design_matrix): self.design_matrix = R2pandas(base.data_frame(design_matrix)) num_IV = self.design_matrix.shape[1] else: num_IV = 0 if permute and verbose: print("Using {} permutations to determine significance...".format(permute)) summary = base.summary(self.model_obj) unsum = base.unclass(summary) # Do scalars first cause they're easier # Get group names separately cause rpy2 > 2.9 is weird and doesnt return them above try: self._get_ngrps(unsum, base) except Exception as _: print(traceback.format_exc()) raise Exception( "The rpy2, lme4, or lmerTest API appears to have changed again. Please file a bug report at https://github.com/ejolly/pymer4/issues with your R, Python, rpy2, lme4, and lmerTest versions and the OS you're running pymer4 on. Apologies." ) # self.AIC = unsum.rx2("AICtab")[0] # the above is incorrect. The AICtab of summary.lmerMod gives the deviance (which in this context is really # -2 LogLik, NOT the AIC self.AIC = R2numpy(stats.AIC(self.model_obj))[0] self.BIC = R2numpy(stats.BIC(self.model_obj))[0] self.logLike = R2numpy(unsum.rx2("logLik"))[0] # First check for lme4 printed messages (e.g. convergence info is usually here instead of in warnings) fit_messages = unsum.rx2("optinfo").rx2("conv").rx2("lme4").rx2("messages") # Then check warnings for additional stuff fit_warnings = unsum.rx2("optinfo").rx2("warnings") try: fit_warnings = [fw for fw in fit_warnings] except TypeError: fit_warnings = [] try: fit_messages = [fm for fm in fit_messages] except TypeError: fit_messages = [] fit_messages_warnings = fit_warnings + fit_messages if fit_messages_warnings: self.warnings.extend(fit_messages_warnings) if not no_warnings: for warning in self.warnings: if isinstance(warning, list) | isinstance(warning, np.ndarray): for w in warning: print(w + " \n") else: print(warning + " \n") else: self.warnings = [] # Coefficients, and inference statistics if num_IV != 0: if self.family in ["gaussian", "gamma", "inverse_gaussian", "poisson"]: rstring = ( """ function(model){ out.coef <- data.frame(unclass(summary(model))$coefficients) out.ci <- data.frame(confint(model,method='""" + conf_int + """',nsim=""" + str(n_boot) + """)) n <- c(rownames(out.ci)) idx <- max(grep('sig',n)) out.ci <- out.ci[-seq(1:idx),] out <- cbind(out.coef,out.ci) list(out,rownames(out)) } """ ) estimates_func = robjects.r(rstring) out_summary, out_rownames = estimates_func(self.model_obj) df = R2pandas(out_summary) dfshape = df.shape[1] df.index = list(out_rownames) # gaussian if dfshape == 7: df.columns = [ "Estimate", "SE", "DF", "T-stat", "P-val", "2.5_ci", "97.5_ci", ] df = df[ ["Estimate", "2.5_ci", "97.5_ci", "SE", "DF", "T-stat", "P-val"] ] # gamma, inverse_gaussian elif dfshape == 6: if self.family in ["gamma", "inverse_gaussian"]: df.columns = [ "Estimate", "SE", "T-stat", "P-val", "2.5_ci", "97.5_ci", ] df = df[ ["Estimate", "2.5_ci", "97.5_ci", "SE", "T-stat", "P-val"] ] else: # poisson df.columns = [ "Estimate", "SE", "Z-stat", "P-val", "2.5_ci", "97.5_ci", ] df = df[ ["Estimate", "2.5_ci", "97.5_ci", "SE", "Z-stat", "P-val"] ] # Incase lmerTest chokes it won't return p-values elif dfshape == 5 and self.family == "gaussian": if not permute: warnings.warn( "MODELING FIT WARNING! Check model.warnings!! P-value computation did not occur because lmerTest choked. Possible issue(s): ranefx have too many parameters or too little variance..." ) df.columns = ["Estimate", "SE", "T-stat", "2.5_ci", "97.5_ci"] df = df[["Estimate", "2.5_ci", "97.5_ci", "SE", "T-stat"]] elif self.family == "binomial": rstring = ( """ function(model){ out.coef <- data.frame(unclass(summary(model))$coefficients) out.ci <- data.frame(confint(model,method='""" + conf_int + """',nsim=""" + str(n_boot) + """)) n <- c(rownames(out.ci)) idx <- max(grep('sig',n)) out.ci <- out.ci[-seq(1:idx),] out <- cbind(out.coef,out.ci) odds <- exp(out.coef[1]) colnames(odds) <- "OR" probs <- odds / (1 + odds) colnames(probs) <- "Prob" odds.ci <- exp(out.ci) colnames(odds.ci) <- c("OR_2.5_ci","OR_97.5_ci") probs.ci <- odds.ci / (1 + odds.ci) if(ncol(probs.ci) == 1){ probs.ci = t(probs.ci) } colnames(probs.ci) <- c("Prob_2.5_ci","Prob_97.5_ci") out <- cbind(out,odds,odds.ci,probs,probs.ci) list(out,rownames(out)) } """ ) estimates_func = robjects.r(rstring) out_summary, out_rownames = estimates_func(self.model_obj) df = R2pandas(out_summary) df.index = list(out_rownames) df.columns = [ "Estimate", "SE", "Z-stat", "P-val", "2.5_ci", "97.5_ci", "OR", "OR_2.5_ci", "OR_97.5_ci", "Prob", "Prob_2.5_ci", "Prob_97.5_ci", ] df = df[ [ "Estimate", "2.5_ci", "97.5_ci", "SE", "OR", "OR_2.5_ci", "OR_97.5_ci", "Prob", "Prob_2.5_ci", "Prob_97.5_ci", "Z-stat", "P-val", ] ] if permute: perm_dat = dat.copy() dv_var = self.formula.split("~")[0].strip() grp_vars = list(self.grps.keys()) perms = [] for _ in range(permute): perm_dat[dv_var] = perm_dat.groupby(grp_vars)[dv_var].transform( lambda x: x.sample(frac=1) ) if self.family == "gaussian": perm_obj = lmer.lmer(self.formula, data=perm_dat, REML=REML) else: perm_obj = lmer.glmer(self.formula, data=perm_dat, family=_fam) perms.append(_return_t(perm_obj)) perms = np.array(perms) pvals = [] for c in range(df.shape[0]): if self.family in ["gaussian", "gamma", "inverse_gaussian"]: pvals.append(_perm_find(perms[:, c], df["T-stat"][c])) else: pvals.append(_perm_find(perms[:, c], df["Z-stat"][c])) df["P-val"] = pvals if "DF" in df.columns: df["DF"] = [permute] * df.shape[0] df = df.rename(columns={"DF": "Num_perm", "P-val": "Perm-P-val"}) else: df["Num_perm"] = [permute] * df.shape[0] df = df.rename(columns={"P-val": "Perm-P-val"}) if "P-val" in df.columns: df = df.assign(Sig=df["P-val"].apply(lambda x: _sig_stars(x))) elif "Perm-P-val" in df.columns: df = df.assign(Sig=df["Perm-P-val"].apply(lambda x: _sig_stars(x))) if (conf_int == "boot") and (permute is None): # We're computing parametrically bootstrapped ci's so it doesn't make sense to use approximation for p-values. Instead remove those from the output and make significant inferences based on whether the bootstrapped ci's cross 0. df = df.drop(columns=["P-val", "Sig"]) if "DF" in df.columns: df = df.drop(columns="DF") df["Sig"] = df.apply( lambda row: "*" if row["2.5_ci"] * row["97.5_ci"] > 0 else "", axis=1, ) # Because all models except lmm have no DF column make sure Num_perm gets put in the right place if permute: if self.family != "gaussian": cols = list(df.columns) col_order = cols[:-4] + ["Num_perm"] + cols[-4:-2] + [cols[-1]] df = df[col_order] self.coefs = df # Make sure the design matrix column names match population coefficients self.design_matrix.columns = self.coefs.index[:] else: self.coefs = None if permute or conf_int == "boot": print( "**NOTE**: Non-parametric inference only applies to fixed effects and none were estimated\n" ) self.fitted = True # Random effect variances and correlations varcor_NAs = ["NA", "N", robjects.NA_Character] # NOQA df = R2pandas(base.data_frame(unsum.rx2("varcor"))) ran_vars = df.query("var2 in @varcor_NAs").drop("var2", axis=1) ran_vars.index = ran_vars["grp"] ran_vars.drop("grp", axis=1, inplace=True) ran_vars.columns = ["Name", "Var", "Std"] ran_vars.index.name = None ran_vars.replace("NA", "", inplace=True) ran_vars = ran_vars.applymap( lambda x: np.nan if x == robjects.NA_Character else x ) ran_vars.replace(np.nan, "", inplace=True) ran_corrs = df.query("var2 not in @varcor_NAs").drop("vcov", axis=1) if ran_corrs.shape[0] != 0: ran_corrs.index = ran_corrs["grp"] ran_corrs.drop("grp", axis=1, inplace=True) ran_corrs.columns = ["IV1", "IV2", "Corr"] ran_corrs.index.name = None ran_corrs = ran_corrs.applymap( lambda x: np.nan if x == robjects.NA_Character else x ) ran_corrs.replace(np.nan, "", inplace=True) else: ran_corrs = None self.ranef_var = ran_vars self.ranef_corr = ran_corrs # Cluster (e.g subject) level coefficients rstring = """ function(model){ getIndex <- function(df){ orignames <- names(df) df <- transform(df, index=row.names(df)) names(df) <- append(orignames, c("index")) df } out <- coef(model) out <- lapply(out, getIndex) out } """ fixef_func = robjects.r(rstring) fixefs = fixef_func(self.model_obj) fixefs = [R2pandas(e).drop(columns=["index"]) for e in fixefs] if len(fixefs) > 1: if self.coefs is not None: f_corrected_order = [] for f in fixefs: f_corrected_order.append( pd.DataFrame( f[ list(self.coefs.index) + [ elem for elem in f.columns if elem not in self.coefs.index ] ] ) ) self.fixef = f_corrected_order else: self.fixef = list(fixefs) else: self.fixef = fixefs[0] if self.coefs is not None: self.fixef = self.fixef[ list(self.coefs.index) + [ elem for elem in self.fixef.columns if elem not in self.coefs.index ] ] # Sort column order to match population coefs # This also handles cases in which random slope terms exist in the model without corresponding fixed effects terms, which generates extra columns in this dataframe. By default put those columns *after* the fixed effect columns of interest (i.e. population coefs) # Cluster (e.g subject) level random deviations rstring = """ function(model){ uniquify <- function(df){ colnames(df) <- make.unique(colnames(df)) df } getIndex <- function(df){ df <- transform(df, index=row.names(df)) df } out <- lapply(ranef(model),uniquify) out <- lapply(out, getIndex) out } """ ranef_func = robjects.r(rstring) ranefs = ranef_func(self.model_obj) if len(ranefs) > 1: self.ranef = [R2pandas(e).drop(columns=["index"]) for e in ranefs] else: self.ranef = R2pandas(ranefs[0]).drop(columns=["index"]) # Model residuals rstring = """ function(model){ out <- resid(model) out } """ resid_func = robjects.r(rstring) self.residuals = np.array(resid_func(self.model_obj)) try: self.data["residuals"] = copy(self.residuals) except ValueError as _: # NOQA print( "**NOTE**: Column for 'residuals' not created in model.data, but saved in model.resid only. This is because you have rows with NaNs in your data.\n" ) # Model fits rstring = """ function(model){ out <- fitted(model) out } """ fit_func = robjects.r(rstring) self.fits = fit_func(self.model_obj) try: self.data["fits"] = copy(self.fits) except ValueError as _: # NOQA print( "**NOTE** Column for 'fits' not created in model.data, but saved in model.fits only. This is because you have rows with NaNs in your data.\n" ) if summarize: return self.summary()
[docs] def simulate(self, num_datasets, use_rfx=True, verbose=False): """ Simulate new responses based upon estimates from a fitted model. By default group/cluster means for simulated data will match those of the original data. Unlike predict, this is a non-deterministic operation because lmer will sample random-efects values for all groups/cluster and then sample data points from their respective conditional distributions. Args: num_datasets (int): number of simulated datasets to generate. Each simulation always generates a dataset that matches the size of the original data use_rfx (bool): wehther to match group/cluster means in simulated data verbose (bool): whether to print R messages to console Returns: np.ndarray: simulated data values """ self._set_R_stdout(verbose) if isinstance(num_datasets, float): num_datasets = int(num_datasets) if not isinstance(num_datasets, int): raise ValueError("num_datasets must be an integer") if use_rfx: re_form = "NULL" else: re_form = "NA" rstring = ( """ function(model){ out <- simulate(model,""" + str(num_datasets) + """,allow.new.levels=TRUE,re.form=""" + re_form + """) out } """ ) simulate_func = robjects.r(rstring) sims = simulate_func(self.model_obj) out = R2pandas(sims) return out
[docs] def predict( self, data, use_rfx=True, pred_type="response", skip_data_checks=False, verify_predictions=True, verbose=False, ): """ Make predictions given new data. Input must be a dataframe that contains the same columns as the model.matrix excluding the intercept (i.e. all the predictor variables used to fit the model). If using random effects to make predictions, input data must also contain a column for the group identifier that were used to fit the model random effects terms. Using random effects to make predictions only makes sense if predictions are being made about the same groups/clusters. If any predictors are categorical, you can skip verifying column names by setting `skip_data_checks=True`. Args: data (pandas.core.frame.DataFrame): input data to make predictions on use_rfx (bool): whether to condition on random effects when making predictions; Default True pred_type (str): whether the prediction should be on the 'response' scale (default); or on the 'link' scale of the predictors passed through the link function (e.g. log-odds scale in a logit model instead of probability values) skip_data_checks (bool): whether to skip checks that input data have the same columns as the original data the model were trained on. If predicting using a model trained with categorical variables it can be helpful to set this to False. Default True verify_predictions (bool): whether to ensure that the predicted data are not identical to original model fits. Only useful to set this to False when making predictions on the same data the model was fit on, but its faster to access these directly from model.fits or model.data['fits']. Default True verbose (bool): whether to print R messages to console Returns: np.ndarray: prediction values """ self._set_R_stdout(verbose) if self.design_matrix is None: raise ValueError( "No fixed effects were estimated so prediction is not possible!" ) if not skip_data_checks: required_cols = self.design_matrix.columns[1:] if not all([col in data.columns for col in required_cols]): raise ValueError( "Column names do not match all fixed effects model terms!\nThis may be a false error if some predictors are categorical, in which case you can bypass this check by setting skip_data_checks=True." ) if use_rfx: if not skip_data_checks: required_cols = set(list(required_cols) + list(self.grps.keys())) if not all([col in data.columns for col in required_cols]): raise ValueError( "Column names are missing random effects model grouping terms!" ) re_form = "NULL" else: re_form = "NA" rstring = ( """ function(model,new){ out <- predict(model,new,allow.new.levels=TRUE,re.form=""" + re_form + """,type='""" + pred_type + """') out } """ ) predict_func = robjects.r(rstring) preds = predict_func(self.model_obj, pandas2R(data)) if verify_predictions: self._verify_preds(preds, use_rfx) return preds
def _verify_preds(self, preds, use_rfx): """ Verify that the output of .predict given new data is not identitical to the model's fits from the data it was trained on. This is necessary because `predict()` in R will silently fallback to return the same predicted values when given input data with no matching columns, but with `use_rfx=False` Args: preds (np.array): output of self.predict() use_rfx (bool): same input parameter as self.predict Raises: ValueError: If predictions match model fits from original data """ training_preds = self.predict( self.data, use_rfx=use_rfx, skip_data_checks=True, verify_predictions=False ) mess = "(using rfx)" if use_rfx else "(without rfx)" if np.allclose(training_preds, preds): raise ValueError( f"Predictions are identitical to fitted values {mess}!!\nYou can ignore this error if you intended to predict using the same data the model was trained on by setting verify_predictions=False. If you didn't, then its likely that some or all of the column names in your test data don't match the column names from the data the model was trained on and you set skip_data_checks=True." )
[docs] def summary(self): """ Summarize the output of a fitted model. Returns: pd.DataFrame: R/statsmodels style summary """ if not self.fitted: raise RuntimeError("Model must be fitted to generate summary!") if self._REML: print("Linear mixed model fit by REML [’lmerMod’]") else: print("Linear mixed model fit by maximum likelihood ['lmerMod']") print("Formula: {}\n".format(self.formula)) print("Family: {}\t Inference: {}\n".format(self.family, self.sig_type)) print( "Number of observations: %s\t Groups: %s\n" % (self.data.shape[0], self.grps) ) print("Log-likelihood: %.3f \t AIC: %.3f\n" % (self.logLike, self.AIC)) print("Random effects:\n") print("%s\n" % (self.ranef_var.round(3))) if self.ranef_corr is not None: print("%s\n" % (self.ranef_corr.round(3))) else: print("No random effect correlations specified\n") if self.coefs is None: print("No fixed effects estimated\n") return else: print("Fixed effects:\n") return self.coefs.round(3)
[docs] def post_hoc( self, marginal_vars, grouping_vars=None, p_adjust="tukey", summarize=True, dof_asymptotic_at=99999, verbose=False, ): """ Post-hoc pair-wise tests corrected for multiple comparisons (Tukey method) implemented using the emmeans package. This method provide both marginal means/trends along with marginal pairwise differences. More info can be found at: https://cran.r-project.org/web/packages/emmeans/emmeans.pdf Args: marginal_var (str/list): what variable(s) to compute marginal means/trends for; unique combinations of factor levels of these variable(s) will determine family-wise error correction grouping_vars (str/list): what variable(s) to group on. Trends/means/comparisons of other variable(s), will be computed at each level of these variable(s) p_adjust (str): multiple comparisons adjustment method. One of: tukey, bonf, fdr, hochberg, hommel, holm, dunnet, mvt (monte-carlo multi-variate T, aka exact tukey/dunnet). Default tukey summarize (bool): output effects and contrasts or don't (always stored in model object as model.marginal_estimates and model.marginal_contrasts); default True dof_asymptotic_at (int, optional): at what number of observations to assuming a normal distribution and return z-stats instead of t-stats; Default 100000 which differs from emmeans default of 3000 verbose (bool): whether to print R messages to the console Returns: Multiple: - **marginal_estimates** (*pd.Dataframe*): unique factor level effects (e.g. means/coefs) - **marginal_contrasts** (*pd.DataFrame*): contrasts between factor levels Examples: Pairwise comparison of means of A at each level of B >>> model.post_hoc(marginal_vars='A',grouping_vars='B') Pairwise differences of slopes of C between levels of A at each level of B >>> model.post_hoc(marginal_vars='C',grouping_vars=['A','B']) Pairwise differences of each unique A,B cell >>> model.post_hoc(marginal_vars=['A','B']) """ self._set_R_stdout(verbose) if not marginal_vars: raise ValueError("Must provide marginal_vars") if not self.fitted: raise RuntimeError("Model must be fitted to generate post-hoc comparisons") if verbose and self.data.shape[0] >= dof_asymptotic_at: warnings.warn( f"Number of observations {self.data.shape[0]} exceeds dof_symptotic_at={dof_asymptotic_at}. Returning z-stats" ) if not isinstance(marginal_vars, list): marginal_vars = [marginal_vars] if p_adjust is None: p_adjust = "none" if grouping_vars and not isinstance(grouping_vars, list): grouping_vars = [grouping_vars] # Conditional vars can only be factor types if not all([elem in self.factors.keys() for elem in grouping_vars]): raise ValueError( "All grouping_vars must be existing categorical variables (i.e. factors)" ) # Need to figure out if marginal_vars is continuous or not to determine lstrends or emmeans call cont, factor = [], [] for var in marginal_vars: if not self.factors or var not in self.factors.keys(): cont.append(var) else: factor.append(var) if cont: if factor: raise ValueError( "With more than one marginal variable, all variables must be categorical factors. Mixing continuous and categorical variables is not supported. Try passing additional categorical factors to grouping_vars" "" ) else: if len(cont) > 1: raise ValueError( "Marginal variables can only contain one continuous variable" ) elif len(cont) == 1: if grouping_vars: # Emtrends; there's a bug for trends where options don't get set by default so an empty list is passed to R, see: https://bit.ly/2VJ9QZM cont = cont[0] if len(grouping_vars) > 1: g1 = grouping_vars[0] _conditional = "+".join(grouping_vars[1:]) rstring = ( """ function(model){ suppressMessages(library(emmeans)) out <- emtrends(model,pairwise ~ """ + g1 + """|""" + _conditional + """,var='""" + cont + """',adjust='""" + p_adjust + """',options=list(),lmer.df='satterthwaite',lmerTest.limit='""" + str(dof_asymptotic_at) + """') out }""" ) else: rstring = ( """ function(model){ suppressMessages(library(emmeans)) out <- emtrends(model,pairwise ~ """ + grouping_vars[0] + """,var='""" + cont + """',adjust='""" + p_adjust + """',options=list(),lmer.df='satterthwaite',lmerTest.limit='""" + str(dof_asymptotic_at) + """') out }""" ) else: raise ValueError( "grouping_vars are required with a continuous marginal_vars" ) else: if factor: _marginal = "+".join(factor) if grouping_vars: # emmeans with pipe _conditional = "+".join(grouping_vars) rstring = ( """ function(model){ suppressMessages(library(emmeans)) out <- emmeans(model,pairwise ~ """ + _marginal + """|""" + _conditional + """, adjust='""" + p_adjust + """',lmer.df='satterthwaite',lmerTest.limit='""" + str(dof_asymptotic_at) + """') out }""" ) else: # emmeans without pipe rstring = ( """ function(model){ suppressMessages(library(emmeans)) out <- emmeans(model,pairwise ~ """ + _marginal + """,adjust='""" + p_adjust + """',lmer.df='satterthwaite',lmerTest.limit='""" + str(dof_asymptotic_at) + """') out }""" ) else: raise ValueError("marginal_vars are not in model!") func = robjects.r(rstring) res = func(self.model_obj) emmeans = importr("emmeans") # Marginal estimates self.marginal_estimates = R2pandas(base.summary(res)[0]) # Resort columns effect_names = list(self.marginal_estimates.columns[:-4]) # this column name changes depending on whether we're doing post-hoc trends or means effname = effect_names[-1] sortme = effect_names[:-1] + ["Estimate", "2.5_ci", "97.5_ci", "SE", "DF"] # In emmeans (compared to lsmeans) the CI column names change too depending on how many factor variabls are in the model if "asymp.LCL" in self.marginal_estimates.columns: self.marginal_estimates = self.marginal_estimates.rename( columns={ effname: "Estimate", "df": "DF", "asymp.LCL": "2.5_ci", "asymp.UCL": "97.5_ci", } )[sortme] elif "lower.CL" in self.marginal_estimates.columns: self.marginal_estimates = self.marginal_estimates.rename( columns={ effname: "Estimate", "df": "DF", "lower.CL": "2.5_ci", "upper.CL": "97.5_ci", } )[sortme] else: raise ValueError( f"Cannot figure out what emmeans is naming marginal CI columns. Expected 'lower.CL' or 'asymp.LCL', but columns are {self.marginal_estimates.columns}" ) # Marginal Contrasts self.marginal_contrasts = R2pandas(base.summary(res)[1]) if "t.ratio" in self.marginal_contrasts.columns: rename_dict = { "t.ratio": "T-stat", "p.value": "P-val", "estimate": "Estimate", "df": "DF", "contrast": "Contrast", } sorted_names = [ "Estimate", "2.5_ci", "97.5_ci", "SE", "DF", "T-stat", "P-val", ] elif "z.ratio" in self.marginal_contrasts.columns: rename_dict = { "z.ratio": "Z-stat", "p.value": "P-val", "estimate": "Estimate", "df": "DF", "contrast": "Contrast", } sorted_names = [ "Estimate", "2.5_ci", "97.5_ci", "SE", "DF", "Z-stat", "P-val", ] else: raise ValueError( f"Cannot figure out what emmeans is naming contrast means columns. Expected 't.ratio' or 'z.ratio', but columns are: {self.marginal_contrasts.columns}" ) self.marginal_contrasts = self.marginal_contrasts.rename(columns=rename_dict) # Need to make another call to emmeans to get confidence intervals on contrasts confs = R2pandas(base.unclass(emmeans.confint_emmGrid(res))[1]) confs = confs.iloc[:, -2:] # Deal with changing column names again if "asymp.LCL" in confs.columns: confs = confs.rename( columns={"asymp.LCL": "2.5_ci", "asymp.UCL": "97.5_ci"} ) elif "lower.CL" in confs.columns: confs = confs.rename(columns={"lower.CL": "2.5_ci", "upper.CL": "97.5_ci"}) else: raise ValueError( f"Cannot figure out what emmeans is naming contrast CI columns. Expected 'lower.CL' or 'asymp.LCL', but columns are {self.marginal_estimates.columns}" ) self.marginal_contrasts = pd.concat([self.marginal_contrasts, confs], axis=1) # Resort columns effect_names = list(self.marginal_contrasts.columns[:-7]) sortme = effect_names + sorted_names self.marginal_contrasts = self.marginal_contrasts[sortme] self.marginal_contrasts["Sig"] = self.marginal_contrasts["P-val"].apply( _sig_stars ) if summarize: if ( p_adjust == "tukey" and self.marginal_contrasts.shape[0] >= self.marginal_estimates.shape[0] ): print( "P-values adjusted by tukey method for family of {} estimates".format( self.marginal_contrasts["Contrast"].nunique() ) ) elif p_adjust != "tukey": print( "P-values adjusted by {} method for {} comparisons".format( p_adjust, self.marginal_contrasts["Contrast"].nunique() ) ) return self.marginal_estimates.round(3), self.marginal_contrasts.round(3)
[docs] def plot_summary( self, figsize=(12, 6), error_bars="ci", ranef=True, axlim=None, plot_intercept=True, ranef_alpha=0.5, coef_fmt="o", orient="v", ranef_idx=0, ): """ Create a forestplot overlaying estimated coefficients with random effects (i.e. BLUPs). By default display the 95% confidence intervals computed during fitting. Args: error_bars (str): one of 'ci' or 'se' to change which error bars are plotted; default 'ci' ranef (bool): overlay BLUP estimates on figure; default True axlim (tuple): lower and upper limit of plot; default min and max of BLUPs plot_intercept (bool): plot the intercept estimate; default True ranef_alpha (float): opacity of random effect points; default .5 coef_fmt (str): matplotlib marker style for population coefficients ranef_idx (int): if multiple random effects clusters were specified this value indicates which one should be plotted; uses 0-based indexing; default 0 (first) Returns: plt.axis: matplotlib axis handle """ if not self.fitted: raise RuntimeError("Model must be fit before plotting!") if orient not in ["h", "v"]: raise ValueError("orientation must be 'h' or 'v'") if isinstance(self.fixef, list): m_ranef = self.fixef[ranef_idx] else: m_ranef = self.fixef m_fixef = self.coefs if not plot_intercept: m_ranef = m_ranef.drop("(Intercept)", axis=1) m_fixef = m_fixef.drop("(Intercept)", axis=0) if error_bars == "ci": col_lb = (m_fixef["Estimate"] - m_fixef["2.5_ci"]).values col_ub = (m_fixef["97.5_ci"] - m_fixef["Estimate"]).values elif error_bars == "se": col_lb, col_ub = m_fixef["SE"], m_fixef["SE"] # For seaborn m = pd.melt(m_ranef) _, ax = plt.subplots(1, 1, figsize=figsize) if ranef: alpha_plot = ranef_alpha else: alpha_plot = 0 if orient == "v": x_strip = "value" x_err = m_fixef["Estimate"] y_strip = "variable" y_err = range(m_fixef.shape[0]) xerr = [col_lb, col_ub] yerr = None ax.vlines( x=0, ymin=-1, ymax=self.coefs.shape[0], linestyles="--", color="grey" ) if not axlim: xlim = (m["value"].min() - 1, m["value"].max() + 1) else: xlim = axlim ylim = None else: y_strip = "value" y_err = m_fixef["Estimate"] x_strip = "variable" x_err = range(m_fixef.shape[0]) yerr = [col_lb, col_ub] xerr = None ax.hlines( y=0, xmin=-1, xmax=self.coefs.shape[0], linestyles="--", color="grey" ) if not axlim: ylim = (m["value"].min() - 1, m["value"].max() + 1) else: ylim = axlim xlim = None sns.stripplot( x=x_strip, y=y_strip, data=m, ax=ax, size=6, alpha=alpha_plot, color="grey" ) ax.errorbar( x=x_err, y=y_err, xerr=xerr, yerr=yerr, fmt=coef_fmt, capsize=0, elinewidth=4, color="black", ms=12, zorder=9999999999, ) ax.set(ylabel="", xlabel="Estimate", xlim=xlim, ylim=ylim) sns.despine(top=True, right=True, left=True) return ax
[docs] def plot( self, param, figsize=(8, 6), xlabel="", ylabel="", plot_fixef=True, plot_ci=True, grps=[], ax=None, ): """ Plot random and group level parameters from a fitted model Args: param (str): model parameter (column name) to plot figsize (tup): matplotlib desired figsize xlabel (str): x-axis label ylabel (str): y-axis label plot_fixef (bool): plot population effect fit of param?; default True plot_ci (bool): plot computed ci's of population effect?; default True grps (list): plot specific group fits only; must correspond to index values in model.fixef ax (matplotlib.axes.Axes): axis handle for an existing plot; if provided will ensure that random parameter plots appear *behind* all other plot objects. Returns: plt.axis: matplotlib axis handle """ if not self.fitted: raise RuntimeError("Model must be fit before plotting!") if self.factors: raise NotImplementedError( "Plotting can currently only handle models with continuous predictors!" ) if isinstance(self.fixef, list) or isinstance(self.ranef, list): raise NotImplementedError( "Plotting can currently only handle models with 1 random effect grouping variable!" ) if self.design_matrix is None: raise ValueError( "No fixed effects were estimated so prediction is not possible!" ) if not ax: _, ax = plt.subplots(1, 1, figsize=figsize) # Get range of unique values for desired parameter x_vals = self.design_matrix[param].unique() # Sort order to handle bug in matplotlib plotting idx = np.argsort(x_vals) # Get desired parameter part of the prediction fixef_pred = ( self.coefs.loc["(Intercept)", "Estimate"] + self.coefs.loc[param, "Estimate"] * x_vals ) fixef_pred_upper = ( self.coefs.loc["(Intercept)", "97.5_ci"] + self.coefs.loc[param, "97.5_ci"] * x_vals ) fixef_pred_lower = ( self.coefs.loc["(Intercept)", "2.5_ci"] + self.coefs.loc[param, "2.5_ci"] * x_vals ) if grps: if all(isinstance(x, int) for x in grps): ran_dat = self.fixef.iloc[grps, :] elif all(isinstance(x, str) for x in grps): ran_dat = self.fixef.loc[grps, :] else: raise TypeError( "grps must be integer list for integer-indexing (.iloc) of fixed effects, or label list for label-indexing (.loc) of fixed effects" ) else: ran_dat = self.fixef # Now generate random effects predictions for _, row in ran_dat.iterrows(): ranef_desired = row["(Intercept)"] + row[param] * x_vals # ranef_other = np.dot(other_vals_means, row.loc[other_vals]) pred = ranef_desired # + ranef_other ax.plot(x_vals[idx], pred[idx], "-", linewidth=2) if plot_fixef: ax.plot( x_vals[idx], fixef_pred[idx], "--", color="black", linewidth=3, zorder=9999999, ) if plot_ci: ax.fill_between( x_vals[idx], fixef_pred_lower[idx], fixef_pred_upper[idx], facecolor="black", alpha=0.25, zorder=9999998, ) ax.set( ylim=(self.data.fits.min(), self.data.fits.max()), xlim=(x_vals.min(), x_vals.max()), xlabel=param, ylabel=self.formula.split("~")[0].strip(), ) if xlabel: ax.set_xlabel(xlabel) if ylabel: ax.set_ylabel(ylabel) return ax
[docs] def confint( self, parm=None, level=0.95, method="Wald", zeta=None, nsim=500, boot_type="perc", quiet=False, oldnames=False, seed=None, ): """ Compute confidence intervals on the parameters of a Lmer object (this is a wrapper for confint.merMod in lme4). Args: self (Lmer): the Lmer object for which confidence intervals should be computed parm (list of str): parameter names for which intervals are sought. Specified by an integer vector of positions (leave blank to compute ci for all parameters) level (float): confidence level <1, typically above 0.90 method (str): which method to compute confidence intervals; 'profile', 'Wald' (default), or 'boot' (parametric bootstrap) zeta (float): (for method = "profile" only:) likelihood cutoff (if not specified, as by default, computed from level in R). nsim (int): number of bootstrap intervals if bootstrapped confidence intervals are requests; default 500 boot_type (str): bootstrap confidence interval type (one of "perc","basic","norm", as defined in boot_ci in R) quiet (bool): (logical) suppress messages about computationally intensive profiling? oldnames: (logical) use old-style names for variance-covariance parameters, e.g. ".sig01", rather than newer (more informative) names such as "sd_(Intercept)|Subject"? seed (int): seed to be passed to bootMer for repeatability. Returns: pd.DataFrame: confidence intervals for the parameters of interest Examples: The following examples demonstrate how to get different types of confidence intervals. The default Wald estimates for all parameters >>> model.confint() Boostrap estimate for the variance component of the random intercept from factor "Group" and the error variance. You will need more than 100 repeats for a real application >>> model.confint(method="boot", nsim=100, parm=["sd_(Intercept)|Group","sigma"]) Same as above but using oldnames for those variances, which is still the devault in lme4 >>> model.confint(method="boot", nsim=100, oldnames=True, parm=[".sig01",".sigma"]) """ # record messages going to screen r_console = [] def _f(x): r_console.append(x) callbacks.consolewrite_warnerror = _f # create string for parm parm_string = "" if parm is not None: parm_string = """,parm=c('""" for i, this_parm in enumerate(parm): if i != 0: parm_string = parm_string + """,'""" parm_string = parm_string + this_parm + """'""" parm_string = parm_string + """)""" # create string of command rstring = ( """ function(model){ out_ci <- as.data.frame(confint(model""" + parm_string + """,level=""" + str(level) + """,method='""" + method + """'""" + ((""",zeta=""" + str(zeta)) if zeta is not None else """""") + ((""",seed=""" + str(seed)) if seed is not None else """""") + """,nsim=""" + str(nsim) + """,boot.type='""" + boot_type + """'""" + """,quiet=""" + ("""TRUE""" if quiet else """FALSE""") + """,oldNames=""" + ("""TRUE""" if oldnames else """FALSE""") + """)) out_ci }""" ) confint_func = robjects.r(rstring) out_summary = confint_func(self.model_obj) out_summary = R2pandas(out_summary) # restore outputs callbacks.consolewrite_warnerror = consolewrite_warning_backup for message in r_console: print(message) return out_summary