"""
Test functions for GEE

External comparisons are to R and Stata.  The statsmodels GEE
implementation should generally agree with the R GEE implementation
for the independence and exchangeable correlation structures.  For
other correlation structures, the details of the correlation
estimation differ among implementations and the results will not agree
exactly.
"""

from statsmodels.compat import lrange
import os
import numpy as np
import pytest
from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose,
                           assert_array_less, assert_raises, assert_warns,
                           assert_)
import statsmodels.genmod.generalized_estimating_equations as gee
import statsmodels.tools as tools
import statsmodels.regression.linear_model as lm
from statsmodels.genmod import families
from statsmodels.genmod import cov_struct
import statsmodels.discrete.discrete_model as discrete

import pandas as pd
from scipy.stats.distributions import norm
import warnings

try:
    import matplotlib.pyplot as plt
except ImportError:
    pass

# Set to True to write figures to a PDF file instead of discarding them.
pdf_output = False

if pdf_output:
    from matplotlib.backends.backend_pdf import PdfPages
    pdf = PdfPages("test_glm.pdf")
else:
    pdf = None


def close_or_save(pdf, fig):
    """
    Save `fig` to `pdf` when the module-level `pdf_output` flag is set.

    Does nothing when `pdf_output` is False.
    """
    if pdf_output:
        pdf.savefig(fig)


def load_data(fname, icept=True):
    """
    Load a data set from the results directory.  The data set should
    be a CSV file with the following format:

    Column 0: Group indicator
    Column 1: endog variable
    Columns 2-end: exog variables

    If `icept` is True, an intercept is prepended to the exog
    variables.

    Returns the tuple (endog, exog, group).
    """

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    Z = np.genfromtxt(os.path.join(cur_dir, 'results', fname),
                      delimiter=",")

    group = Z[:, 0]
    endog = Z[:, 1]
    exog = Z[:, 2:]

    if icept:
        exog = np.concatenate((np.ones((exog.shape[0], 1)), exog),
                              axis=1)

    return endog, exog, group


def check_wrapper(results):
    # check wrapper: the wrapped results expose pandas Series, while
    # the underlying `_results` object exposes plain ndarrays.
    assert_(isinstance(results.params, pd.Series))
    assert_(isinstance(results.fittedvalues, pd.Series))
    assert_(isinstance(results.resid, pd.Series))
    assert_(isinstance(results.centered_resid, pd.Series))

    assert_(isinstance(results._results.params, np.ndarray))
    assert_(isinstance(results._results.fittedvalues, np.ndarray))
    assert_(isinstance(results._results.resid, np.ndarray))
    assert_(isinstance(results._results.centered_resid, np.ndarray))


class TestGEE(object):

    def test_margins_gaussian(self):
        # Check marginal effects for a Gaussian GEE fit.  Marginal
        # effects and ordinary effects should be equal.
        n = 40
        np.random.seed(34234)
        exog = np.random.normal(size=(n, 3))
        exog[:, 0] = 1

        groups = np.kron(np.arange(n / 4), np.r_[1, 1, 1, 1])
        endog = exog[:, 1] + np.random.normal(size=n)

        model = gee.GEE(endog, exog, groups)
        result = model.fit(
            start_params=[-4.88085602e-04, 1.18501903, 4.78820100e-02])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, result.params[1:])
        assert_allclose(marg.margeff_se, result.bse[1:])

        # smoke test
        marg.summary()

    def test_margins_logistic(self):
        # Check marginal effects for a binomial GEE fit.  Comparison
        # comes from Stata.

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.GEE(endog, exog, groups, family=families.Binomial())
        result = model.fit(
            cov_type='naive', start_params=[-3.29583687, 2.19722458])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[0.4119796])
        assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6)

    def test_margins_multinomial(self):
        # Check marginal effects for a 2-class multinomial GEE fit,
        # which should be equivalent to logistic regression.  Comparison
        # comes from Stata.

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.NominalGEE(endog, exog, groups)
        result = model.fit(cov_type='naive', start_params=[
                           3.295837, -2.197225])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[-0.41197961], rtol=1e-5)
        assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6)

    @pytest.mark.smoke
    @pytest.mark.matplotlib
    def test_nominal_plot(self, close_figures):
        # Smoke test: plot_distribution on a nominal GEE fit returns a
        # matplotlib Figure.
        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.NominalGEE(endog, exog, groups)
        result = model.fit(cov_type='naive',
                           start_params=[3.295837, -2.197225])

        fig = result.plot_distribution()
        assert_(isinstance(fig, plt.Figure))

    def test_margins_poisson(self):
        # Check marginal effects for a Poisson GEE fit.

        np.random.seed(34234)
        endog = np.r_[10, 15, 12, 13, 20, 18, 26, 29]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[0, 0, 0, 0, 1, 1, 1, 1]

        groups = np.arange(8)

        model = gee.GEE(endog, exog, groups, family=families.Poisson())
        result = model.fit(cov_type='naive', start_params=[
                           2.52572864, 0.62057649])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[11.0928], rtol=1e-6)
        assert_allclose(marg.margeff_se, np.r_[3.269015], rtol=1e-6)

    def test_multinomial(self):
        """
        Check the 2-class multinomial (nominal) GEE fit against
        logistic regression.
        """

        np.random.seed(34234)

        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.NominalGEE(endog, exog, groups)
        results = model.fit(cov_type='naive', start_params=[
                            3.295837, -2.197225])

        logit_model = gee.GEE(endog, exog, groups,
                              family=families.Binomial())
        logit_results = logit_model.fit(cov_type='naive')

        # The nominal parameterization has the opposite sign.
        assert_allclose(results.params, -logit_results.params, rtol=1e-5)
        assert_allclose(results.bse, logit_results.bse, rtol=1e-5)

    def test_weighted(self):

        # Simple check where the answer can be computed by hand.
        exog = np.ones(20)
        weights = np.ones(20)
        weights[0:10] = 2
        endog = np.zeros(20)
        endog[0:10] += 1
        groups = np.kron(np.arange(10), np.r_[1, 1])
        model = gee.GEE(endog, exog, groups, weights=weights)
        result = model.fit()
        assert_allclose(result.params, np.r_[2 / 3.])

        # Comparison against stata using groups with different sizes.
        weights = np.ones(20)
        weights[10:] = 2
        endog = np.r_[1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6,
                      7, 8, 7, 8]
        exog1 = np.r_[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
                      3, 3, 3, 3]
        groups = np.r_[1, 1, 2, 2, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6,
                       8, 8, 9, 9, 10, 10]
        exog = np.column_stack((np.ones(20), exog1))

        # Comparison using independence model
        model = gee.GEE(endog, exog, groups, weights=weights,
                        cov_struct=cov_struct.Independence())
        g = np.mean([2, 4, 2, 2, 4, 2, 2, 2])
        fac = 20 / float(20 - g)
        result = model.fit(ddof_scale=0, scaling_factor=fac)

        assert_allclose(result.params, np.r_[1.247573, 1.436893], atol=1e-6)
        assert_allclose(result.scale, 1.808576)

        # Stata multiplies robust SE by sqrt(N / (N - g)), where N is
        # the total sample size and g is the average group size.
        assert_allclose(result.bse, np.r_[0.895366, 0.3425498], atol=1e-5)

        # Comparison using exchangeable model
        # Smoke test for now
        model = gee.GEE(endog, exog, groups, weights=weights,
                        cov_struct=cov_struct.Exchangeable())
        model.fit(ddof_scale=0)

    # This is in the release announcement for version 0.6.
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = families.Poisson()
        ind = cov_struct.Independence()
        mod1 = gee.GEE.from_formula("y ~ age + trt + base", data["subject"],
                                    data, cov_struct=ind, family=fam)
        rslt1 = mod1.fit(cov_type='naive')

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM

        mod2 = GLM.from_formula("y ~ age + trt + base", data,
                                family=families.Poisson())
        rslt2 = mod2.fit()

        # do not use wrapper, asserts_xxx do not work
        rslt1 = rslt1._results
        rslt2 = rslt2._results

        assert_allclose(rslt1.params, rslt2.params, rtol=1e-6, atol=1e-6)
        assert_allclose(rslt1.bse, rslt2.bse, rtol=1e-6, atol=1e-6)

    def test_missing(self):
        # Test missing data handling for calling from the api.  Missing
        # data handling does not currently work for formulas.

        np.random.seed(34234)
        endog = np.random.normal(size=100)
        exog = np.random.normal(size=(100, 3))
        exog[:, 0] = 1
        groups = np.kron(lrange(20), np.ones(5))

        endog[0] = np.nan
        endog[5:7] = np.nan
        exog[10:12, 1] = np.nan

        mod1 = gee.GEE(endog, exog, groups, missing='drop')
        rslt1 = mod1.fit()

        # Five rows contain a NaN, so exactly 95 rows must remain.
        assert_equal(len(mod1.endog), 95)
        assert_equal(np.asarray(mod1.exog.shape), np.r_[95, 3])

        ii = np.isfinite(endog) & np.isfinite(exog).all(1)

        mod2 = gee.GEE(endog[ii], exog[ii, :], groups[ii], missing='none')
        rslt2 = mod2.fit()

        assert_almost_equal(rslt1.params, rslt2.params)
        assert_almost_equal(rslt1.bse, rslt2.bse)

    def test_missing_formula(self):
        # Test missing data handling for formulas.

        np.random.seed(34234)
        endog = np.random.normal(size=100)
        exog1 = np.random.normal(size=100)
        exog2 = np.random.normal(size=100)
        exog3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        endog[0] = np.nan
        endog[5:7] = np.nan
        exog2[10:12] = np.nan

        data0 = pd.DataFrame({"endog": endog, "exog1": exog1,
                              "exog2": exog2, "exog3": exog3,
                              "groups": groups})

        # k == 1 additionally passes offset and time.
        for k in 0, 1:
            data = data0.copy()
            kwargs = {}
            if k == 1:
                data["offset"] = 0
                data["time"] = 0
                kwargs["offset"] = "offset"
                kwargs["time"] = "time"

            mod1 = gee.GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                                        groups="groups", data=data,
                                        missing='drop', **kwargs)
            rslt1 = mod1.fit()

            assert_equal(len(mod1.endog), 95)
            assert_equal(np.asarray(mod1.exog.shape), np.r_[95, 4])

            data = data.dropna()

            kwargs = {}
            if k == 1:
                kwargs["offset"] = data["offset"]
                kwargs["time"] = data["time"]

            mod2 = gee.GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                                        groups=data["groups"], data=data,
                                        missing='none', **kwargs)
            rslt2 = mod2.fit()

            assert_almost_equal(rslt1.params.values, rslt2.params.values)
            assert_almost_equal(rslt1.bse.values, rslt2.bse.values)

    @pytest.mark.parametrize("k1", [False, True])
    @pytest.mark.parametrize("k2", [False, True])
    def test_invalid_args(self, k1, k2):
        # Inputs of mismatched length must raise ValueError.  In turn,
        # one of endog, exog and groups is given length 18 instead of
        # 20; exposure and time (length 18) are optionally passed too.

        for j in range(3):

            p = [20, 20, 20]
            p[j] = 18

            endog = np.zeros(p[0])
            exog = np.zeros((p[1], 2))

            kwargs = {}
            kwargs["groups"] = np.zeros(p[2])
            if k1:
                kwargs["exposure"] = np.zeros(18)
            if k2:
                kwargs["time"] = np.zeros(18)
            with assert_raises(ValueError):
                gee.GEE(endog, exog, **kwargs)

    def test_default_time(self):
        # Check that the time defaults work correctly.

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = families.Binomial()
        va = cov_struct.Autoregressive(grid=False)

        md1 = gee.GEE(endog, exog, group, family=family, cov_struct=va)
        mdf1 = md1.fit()

        md2 = gee.GEE(endog, exog, group, time=T, family=family,
                      cov_struct=va)
        mdf2 = md2.fit()

        assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
        assert_almost_equal(mdf1.standard_errors(),
                            mdf2.standard_errors(), decimal=6)

    def test_logistic(self):
        # R code for comparing results:

        # library(gee)
        # Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]

        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="independence")
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="exchangeable")
        # sme = summary(me)
        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="AR-M")
        # sma = summary(ma)
        # u = coefficients(sma)
        # cfa = paste(u[,1], collapse=",")
        # sea = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        # sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = families.Binomial()
        ve = cov_struct.Exchangeable()
        vi = cov_struct.Independence()
        va = cov_struct.Autoregressive(grid=False)

        # From R gee
        cf = [[0.0167272965285882, 1.13038654425893,
               -1.86896345082962, 1.09397608331333],
              [0.0178982283915449, 1.13118798191788,
               -1.86133518416017, 1.08944256230299],
              [0.0109621937947958, 1.13226505028438,
               -1.88278757333046, 1.09954623769449]]
        se = [[0.127291720283049, 0.166725808326067,
               0.192430061340865, 0.173141068839597],
              [0.127045031730155, 0.165470678232842,
               0.192052750030501, 0.173174779369249],
              [0.127240302296444, 0.170554083928117,
               0.191045527104503, 0.169776150974586]]

        for j, v in enumerate((vi, ve, va)):
            md = gee.GEE(endog, exog, group, T, family, v)
            mdf = md.fit()
            # The autoregressive fit is run but not compared to R.
            if v is not va:
                assert_almost_equal(mdf.params, cf[j], decimal=6)
                assert_almost_equal(mdf.standard_errors(), se[j],
                                    decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                                      family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)

        # FIXME: do not leave commented-out
        # Check for run-time exceptions in summary
        # print(mdf.summary())

    def test_autoregressive(self):
        # Regression test: fit an autoregressive working correlation
        # to Gaussian data simulated with AR(0.5) errors, for group
        # sizes 1, 2 and 3.

        dep_params_true = [0, 0.589208623896, 0.559823804948]

        params_true = [[1.08043787, 1.12709319, 0.90133927],
                       [0.9613677, 1.05826987, 0.90832055],
                       [1.05370439, 0.96084864, 0.93923374]]

        np.random.seed(342837482)

        num_group = 100
        ar_param = 0.5
        k = 3

        ga = families.Gaussian()

        for gsize in 1, 2, 3:

            # Correlation matrix ar_param ** |i - j| and its Cholesky
            # factor, used to generate correlated errors.
            ix = np.arange(gsize)[:, None] - np.arange(gsize)[None, :]
            ix = np.abs(ix)
            cmat = ar_param ** ix
            cmat_r = np.linalg.cholesky(cmat)

            endog = []
            exog = []
            groups = []
            for i in range(num_group):
                x = np.random.normal(size=(gsize, k))
                exog.append(x)
                expval = x.sum(1)
                errors = np.dot(cmat_r, np.random.normal(size=gsize))
                endog.append(expval + errors)
                groups.append(i * np.ones(gsize))

            endog = np.concatenate(endog)
            groups = np.concatenate(groups)
            exog = np.concatenate(exog, axis=0)

            ar = cov_struct.Autoregressive(grid=False)
            md = gee.GEE(endog, exog, groups, family=ga, cov_struct=ar)
            mdf = md.fit()
            assert_almost_equal(ar.dep_params, dep_params_true[gsize - 1])
            assert_almost_equal(mdf.params, params_true[gsize - 1])

    def test_post_estimation(self):
        # For a Gaussian fit, fittedvalues and resid must match the
        # linear predictor and its residuals.

        family = families.Gaussian()
        endog, exog, group = load_data("gee_linear_1.csv")

        ve = cov_struct.Exchangeable()

        md = gee.GEE(endog, exog, group, None, family, ve)
        mdf = md.fit()

        assert_almost_equal(np.dot(exog, mdf.params),
                            mdf.fittedvalues)
        assert_almost_equal(endog - np.dot(exog, mdf.params),
                            mdf.resid)

    def test_scoretest(self):
        # Regression tests

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3 * np.random.normal(size=n)
        group = np.kron(np.arange(n / 4), np.ones(4))

        # Test under the null.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0., ])
        family = families.Gaussian()
        va = cov_struct.Independence()
        mod1 = gee.GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
        res1 = mod1.fit()
        assert_almost_equal(res1.score_test()["statistic"],
                            1.08126334)
        assert_almost_equal(res1.score_test()["p-value"],
                            0.2984151086)

        # Test under the alternative.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([1.0, ])
        family = families.Gaussian()
        va = cov_struct.Independence()
        mod2 = gee.GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
        res2 = mod2.fit()
        assert_almost_equal(res2.score_test()["statistic"],
                            3.491110965)
        assert_almost_equal(res2.score_test()["p-value"],
                            0.0616991659)

        # Compare to Wald tests
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        f = np.r_[1, -1]
        for i in range(10):
            endog = exog[:, 0] + (0.5 + i / 10.) * exog[:, 1] +\
                np.random.normal(size=n)
            family = families.Gaussian()
            va = cov_struct.Independence()
            mod0 = gee.GEE(endog, exog, group, family=family,
                           cov_struct=va)
            rslt0 = mod0.fit()
            family = families.Gaussian()
            va = cov_struct.Independence()
            mod1 = gee.GEE(endog, exog, group, family=family,
                           cov_struct=va, constraint=(L, R))
            res1 = mod1.fit()
            se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
            wald_z = np.dot(f, rslt0.params) / se
            wald_p = 2 * norm.cdf(-np.abs(wald_z))
            score_p = res1.score_test()["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)

    @pytest.mark.parametrize("cov_struct", [cov_struct.Independence,
                                            cov_struct.Exchangeable])
    def test_compare_score_test(self, cov_struct):
        # compare_score_test against a submodel must agree with the
        # score test of the equivalent linear constraint.  Note that
        # the `cov_struct` argument (a class) shadows the module here.

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        group = np.kron(np.arange(n / 4), np.ones(4))

        exog_sub = exog[:, [0, 3]]
        endog = exog_sub.sum(1) + 3 * np.random.normal(size=n)

        L = np.asarray([[0, 1, 0, 0], [0, 0, 1, 0]])
        R = np.zeros(2)
        mod_lr = gee.GEE(endog, exog, group, constraint=(L, R),
                         cov_struct=cov_struct())
        mod_lr.fit()

        mod_sub = gee.GEE(endog, exog_sub, group, cov_struct=cov_struct())
        res_sub = mod_sub.fit()

        for call_fit in [False, True]:
            mod = gee.GEE(endog, exog, group, cov_struct=cov_struct())
            if call_fit:
                # Should work with or without fitting the parent model
                mod.fit()
            score_results = mod.compare_score_test(res_sub)
            assert_almost_equal(
                score_results["statistic"],
                mod_lr.score_test_results["statistic"])
            assert_almost_equal(
                score_results["p-value"],
                mod_lr.score_test_results["p-value"])
            assert_almost_equal(
                score_results["df"],
                mod_lr.score_test_results["df"])

    def test_compare_score_test_warnings(self):
        # compare_score_test must warn (or raise) when the parent model
        # and the submodel are not comparable.

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        group = np.kron(np.arange(n / 4), np.ones(4))
        exog_sub = exog[:, [0, 3]]
        endog = exog_sub.sum(1) + 3 * np.random.normal(size=n)

        # Mismatched cov_struct
        with assert_warns(UserWarning):
            mod_sub = gee.GEE(endog, exog_sub, group,
                              cov_struct=cov_struct.Exchangeable())
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group,
                          cov_struct=cov_struct.Independence())
            mod.compare_score_test(res_sub)  # smoketest

        # Mismatched family
        with assert_warns(UserWarning):
            mod_sub = gee.GEE(endog, exog_sub, group,
                              family=families.Gaussian())
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group, family=families.Poisson())
            mod.compare_score_test(res_sub)  # smoketest

        # Mismatched size
        with assert_raises(Exception):
            mod_sub = gee.GEE(endog, exog_sub, group)
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog[0:100], exog[:100, :], group[0:100])
            mod.compare_score_test(res_sub)  # smoketest

        # Mismatched weights
        with assert_warns(UserWarning):
            w = np.random.uniform(size=n)
            mod_sub = gee.GEE(endog, exog_sub, group, weights=w)
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group)
            mod.compare_score_test(res_sub)  # smoketest

        # Parent and submodel are the same dimension
        with pytest.warns(UserWarning):
            mod_sub = gee.GEE(endog, exog, group)
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group)
            mod.compare_score_test(res_sub)  # smoketest

    def test_constraint_covtype(self):
        # Test constraints with different cov types
        np.random.seed(6432)
        n = 200
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3 * np.random.normal(size=n)
        group = np.kron(np.arange(n / 4), np.ones(4))
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0., ])
        family = families.Gaussian()
        va = cov_struct.Independence()
        for cov_type in "robust", "naive", "bias_reduced":
            model = gee.GEE(endog, exog, group, family=family,
                            cov_struct=va, constraint=(L, R))
            result = model.fit(cov_type=cov_type)
            result.standard_errors(cov_type=cov_type)
            # Covariance matrices must have the full (unconstrained)
            # dimension.
            assert_equal(result.cov_robust.shape, (4, 4))
            assert_equal(result.cov_naive.shape, (4, 4))
            if cov_type == "bias_reduced":
                assert_equal(result.cov_robust_bc.shape, (4, 4))

    def test_linear(self):
        # library(gee)

        # Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #          corstr="independence", tol=1e-8, maxit=100)
        # smi = summary(mi)
        # u = coefficients(smi)

        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #          corstr="exchangeable", tol=1e-8, maxit=100)
        # sme = summary(me)
        # u = coefficients(sme)

        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = families.Gaussian()

        endog, exog, group = load_data("gee_linear_1.csv")

        vi = cov_struct.Independence()
        ve = cov_struct.Exchangeable()

        # From R gee
        cf = [[-0.01850226507491, 0.81436304278962,
               -1.56167635393184, 0.794239361055003],
              [-0.0182920577154767, 0.814898414022467,
               -1.56194040106201, 0.793499517527478]]
        se = [[0.0440733554189401, 0.0479993639119261,
               0.0496045952071308, 0.0479467597161284],
              [0.0440369906460754, 0.0480069787567662,
               0.049519758758187, 0.0479760443027526]]

        for j, v in enumerate((vi, ve)):
            md = gee.GEE(endog, exog, group, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=10)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                                      family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=10)

    def test_linear_constrained(self):
        # A fit constrained to have params[3] == 0 must return exactly
        # that, under both working correlation structures.

        family = families.Gaussian()

        np.random.seed(34234)
        exog = np.random.normal(size=(300, 4))
        exog[:, 0] = 1
        endog = np.dot(exog, np.r_[1, 1, 0, 0.2]) +\
            np.random.normal(size=300)
        group = np.kron(np.arange(100), np.r_[1, 1, 1])

        vi = cov_struct.Independence()
        ve = cov_struct.Exchangeable()

        L = np.r_[[[0, 0, 0, 1]]]
        R = np.r_[0, ]

        for v in (vi, ve):
            md = gee.GEE(endog, exog, group, None, family, v,
                         constraint=(L, R))
            mdf = md.fit()
            assert_almost_equal(mdf.params[3], 0, decimal=10)

    def test_nested_linear(self):
        # Regression test for the nested covariance structure.

        family = families.Gaussian()

        endog, exog, group = load_data("gee_nested_linear_1.csv")

        # Inner group labels: alternating runs of five 0's and five 1's.
        group_n = []
        for _ in range(endog.shape[0] // 10):
            group_n.extend([0, ] * 5)
            group_n.extend([1, ] * 5)
        group_n = np.array(group_n)[:, None]

        dp = cov_struct.Independence()
        md = gee.GEE(endog, exog, group, None, family, dp)
        mdf1 = md.fit()

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106]
        se = np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989]
        assert_almost_equal(mdf1.params, cf, decimal=6)
        assert_almost_equal(mdf1.standard_errors(), se,
                            decimal=6)

        ne = cov_struct.Nested()
        md = gee.GEE(endog, exog, group, None, family, ne,
                     dep_data=group_n)
        mdf2 = md.fit(start_params=mdf1.params)

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969]
        se = np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991]
        assert_almost_equal(mdf2.params, cf, decimal=6)
        assert_almost_equal(mdf2.standard_errors(), se,
                            decimal=6)

        smry = mdf2.cov_struct.summary()
        assert_allclose(
            smry.Variance,
            np.r_[1.043878, 0.611656, 1.421205],
            atol=1e-5, rtol=1e-5)

    def test_nested_pandas(self):
        # Nested covariance structure specified through a formula for
        # dep_data, using simulated nested group effects.

        np.random.seed(4234)
        n = 10000

        # Outer groups
        groups = np.kron(np.arange(n // 100), np.ones(100)).astype(int)

        # Inner groups
        groups1 = np.kron(np.arange(n // 50), np.ones(50)).astype(int)
        groups2 = np.kron(np.arange(n // 10), np.ones(10)).astype(int)

        # Group effects
        groups_e = np.random.normal(size=n // 100)
        groups1_e = 2 * np.random.normal(size=n // 50)
        groups2_e = 3 * np.random.normal(size=n // 10)

        y = groups_e[groups] + groups1_e[groups1] + groups2_e[groups2]
        y += 0.5 * np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "TheGroups": groups,
                           "groups1": groups1, "groups2": groups2})

        model = gee.GEE.from_formula("y ~ 1", groups="TheGroups",
                                     dep_data="0 + groups1 + groups2",
                                     cov_struct=cov_struct.Nested(),
                                     data=df)
        result = model.fit()

        # The true variances are 1, 4, 9, 0.25
        smry = result.cov_struct.summary()
        assert_allclose(
            smry.Variance,
            np.r_[1.437299, 4.421543, 8.905295, 0.258480],
            atol=1e-5, rtol=1e-5)

    def test_ordinal(self):
        # Regression test for OrdinalGEE with a global odds ratio
        # dependence structure.

        family = families.Binomial()

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        va = cov_struct.GlobalOddsRatio("ordinal")

        mod = gee.OrdinalGEE(endog, exog, groups, None, family, va)
        rslt = mod.fit()

        # Regression test
        cf = np.r_[1.09250002, 0.0217443, -0.39851092, -0.01812116,
                   0.03023969, 1.18258516, 0.01803453, -1.10203381]
        assert_almost_equal(rslt.params, cf, decimal=5)

        # Regression test
        se = np.r_[0.10883461, 0.10330197, 0.11177088, 0.05486569,
                   0.05997153, 0.09168148, 0.05953324, 0.0853862]
        assert_almost_equal(rslt.bse, se, decimal=5)

        # Check that we get the correct results type
        assert_equal(type(rslt), gee.OrdinalGEEResultsWrapper)
        assert_equal(type(rslt._results), gee.OrdinalGEEResults)

    @pytest.mark.smoke
    def test_ordinal_formula(self):
        # Smoke test: OrdinalGEE and NominalGEE built from formulas.

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.arange(n)
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "groups": groups, "x1": x1, "x2": x2})

        model = gee.OrdinalGEE.from_formula("y ~ 0 + x1 + x2", groups,
                                            data=df)
        model.fit()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model = gee.NominalGEE.from_formula("y ~ 0 + x1 + x2", groups,
                                                data=df)
            model.fit()

    @pytest.mark.smoke
    def test_ordinal_independence(self):
        # Smoke test: OrdinalGEE with OrdinalIndependence.

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        x = np.random.normal(size=(n, 1))

        odi = cov_struct.OrdinalIndependence()
        model1 = gee.OrdinalGEE(y, x, groups, cov_struct=odi)
        model1.fit()

    @pytest.mark.smoke
    def test_nominal_independence(self):
        # Smoke test: NominalGEE with NominalIndependence.

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        x = np.random.normal(size=(n, 1))

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nmi = cov_struct.NominalIndependence()
            model1 = gee.NominalGEE(y, x, groups, cov_struct=nmi)
            model1.fit()

    @pytest.mark.smoke
    @pytest.mark.matplotlib
    def test_ordinal_plot(self, close_figures):
        # Smoke test: plot_distribution on an ordinal GEE fit returns
        # a matplotlib Figure.
        family = families.Binomial()

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        va = cov_struct.GlobalOddsRatio("ordinal")

        mod = gee.OrdinalGEE(endog, exog, groups, None, family, va)
        rslt = mod.fit()

        fig = rslt.plot_distribution()
        assert_(isinstance(fig, plt.Figure))

    def test_nominal(self):
        # Regression tests for NominalGEE.

        endog, exog, groups = load_data("gee_nominal_1.csv",
                                        icept=False)

        # Test with independence correlation
        va = cov_struct.Independence()
        mod1 = gee.NominalGEE(endog, exog, groups, cov_struct=va)
        rslt1 = mod1.fit()

        # Regression test
        cf1 = np.r_[0.450009, 0.451959, -0.918825, -0.468266]
        se1 = np.r_[0.08915936, 0.07005046, 0.12198139, 0.08281258]
        assert_allclose(rslt1.params, cf1, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt1.standard_errors(), se1, rtol=1e-5, atol=1e-5)

        # Test with global odds ratio dependence
        va = cov_struct.GlobalOddsRatio("nominal")
        mod2 = gee.NominalGEE(endog, exog, groups, cov_struct=va)
        rslt2 = mod2.fit(start_params=rslt1.params)

        # Regression test
        cf2 = np.r_[0.455365, 0.415334, -0.916589, -0.502116]
        se2 = np.r_[0.08803614, 0.06628179, 0.12259726, 0.08411064]
        assert_allclose(rslt2.params, cf2, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt2.standard_errors(), se2, rtol=1e-5, atol=1e-5)

        # Make sure we get the correct results type
        assert_equal(type(rslt1), gee.NominalGEEResultsWrapper)
        assert_equal(type(rslt1._results), gee.NominalGEEResults)

    def test_poisson(self):
        # library(gee)
        # Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # X4 = Z[,6]
        # X5 = Z[,7]

        # mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #          corstr="independence", scale.fix=TRUE)
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #          corstr="exchangeable", scale.fix=TRUE)
        # sme = summary(me)

        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = families.Poisson()

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        vi = cov_struct.Independence()
        ve = cov_struct.Exchangeable()

        # From R gee
        cf = [[-0.0364450410793481, -0.0543209391301178,
               0.0156642711741052, 0.57628591338724,
               -0.00465659951186211, -0.477093153099256],
              [-0.0315615554826533, -0.0562589480840004,
               0.0178419412298561, 0.571512795340481,
               -0.00363255566297332, -0.475971696727736]]
        se = [[0.0611309237214186, 0.0390680524493108,
               0.0334234174505518, 0.0366860768962715,
               0.0304758505008105, 0.0316348058881079],
              [0.0610840153582275, 0.0376887268649102,
               0.0325168379415177, 0.0369786751362213,
               0.0296141014225009, 0.0306115470200955]]

        for j, v in enumerate((vi, ve)):
            md = gee.GEE(endog, exog, group_n, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group_n[:, None],
                            exog[:, 1:]), axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = gee.GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id",
                                      D, family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)
            # print(mdf.params)

    def test_groups(self):
        # Test various group structures (nonconsecutive, different
        # group sizes, not ordered, string labels)

        np.random.seed(234)
        n = 40

        x = np.random.normal(size=(n, 2))
        y = np.random.normal(size=n)

        # groups with unequal group sizes
        groups = np.kron(np.arange(n / 4), np.ones(4))
        groups[8:12] = 3
        groups[34:36] = 9

        model1 = gee.GEE(y, x, groups=groups)
        result1 = model1.fit()

        # Unordered groups
        ix = np.random.permutation(n)
        y1 = y[ix]
        x1 = x[ix, :]
        groups1 = groups[ix]

        model2 = gee.GEE(y1, x1, groups=groups1)
        result2 = model2.fit()

        assert_allclose(result1.params, result2.params)
        assert_allclose(result1.tvalues, result2.tvalues)

        # group labels are strings
        mp = {}
        import string
        for j, g in enumerate(set(groups)):
            mp[g] = string.ascii_letters[j:j + 4]
        groups2 = [mp[g] for g in groups]

        model3 = gee.GEE(y, x, groups=groups2)
        result3 = model3.fit()

        assert_allclose(result1.params, result3.params)
        assert_allclose(result1.tvalues, result3.tvalues)

    def test_compare_OLS(self):
        # Gaussian GEE with independence correlation should agree
        # exactly with OLS for parameter estimates and standard errors
        # derived from the naive covariance estimate.

        vs = cov_struct.Independence()
        family = families.Gaussian()

        np.random.seed(34234)
        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                  family=family, cov_struct=vs)
        mdf = md.fit()

        ols = lm.OLS.from_formula("Y ~ X1 + X2 + X3", data=D).fit()

        # do not use wrapper, asserts_xxx do not work
        ols = ols._results

        assert_almost_equal(ols.params, mdf.params, decimal=10)

        se = mdf.standard_errors(cov_type="naive")
        assert_almost_equal(ols.bse, se, decimal=10)

        naive_tvalues = mdf.params / np.sqrt(np.diag(mdf.cov_naive))
        assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)

    def test_formulas(self):
        # Check formulas, especially passing groups and time as either
        # variable names or arrays.

        n = 100
        np.random.seed(34234)
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time,
                             "groups": groups})

        va = cov_struct.Autoregressive(grid=False)
        family = families.Gaussian()

        mod1 = gee.GEE(Y, mat, groups, time=Time, family=family,
                       cov_struct=va)
        rslt1 = mod1.fit()

        mod2 = gee.GEE.from_formula("Y ~ X1", groups, data, time=Time,
                                    family=family, cov_struct=va)
        rslt2 = mod2.fit()

        mod3 = gee.GEE.from_formula("Y ~ X1", groups, data, time="Time",
                                    family=family, cov_struct=va)
        rslt3 = mod3.fit()

        mod4 = gee.GEE.from_formula("Y ~ X1", "groups", data, time=Time,
                                    family=family, cov_struct=va)
        rslt4 = mod4.fit()

        mod5 = gee.GEE.from_formula("Y ~ X1", "groups", data, time="Time",
                                    family=family, cov_struct=va)
        rslt5 = mod5.fit()

        assert_almost_equal(rslt1.params, rslt2.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt3.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt4.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt5.params, decimal=8)

        check_wrapper(rslt2)

    def test_compare_logit(self):
        # Binomial GEE with independence correlation should give the
        # same parameter estimates as Logit.

        vs = cov_struct.Independence()
        family = families.Binomial()

        np.random.seed(34234)
        Y = 1 * (np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                    family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = discrete.Logit.from_formula("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params.values, rslt2.params.values,
                            decimal=10)

    def test_compare_poisson(self):
        # Poisson GEE with independence correlation should give the
        # same parameter estimates as discrete Poisson regression.

        vs = cov_struct.Independence()
        family = families.Poisson()

        np.random.seed(34234)
        Y = np.ceil(-np.log(np.random.uniform(size=100)))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                    family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = discrete.Poisson.from_formula("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params.values, rslt2.params.values,
                            decimal=10)

    def test_predict(self):
        # Check predict with and without explicit exog and offset.

        n = 50
        np.random.seed(4324)
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        Y = np.random.normal(0.1 * (X1 + X2) + offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset})

        fml = "Y ~ X1 + X2"
        model = gee.GEE.from_formula(fml, groups, data,
                                     family=families.Gaussian(),
                                     offset="offset")
        result = model.fit(start_params=[0, 0.1, 0.1])
        assert_equal(result.converged, True)

        pred1 = result.predict()
        pred2 = result.predict(offset=data.offset)
        pred3 = result.predict(exog=data[["X1", "X2"]], offset=data.offset)
        pred4 = result.predict(exog=data[["X1", "X2"]],
                               offset=0 * data.offset)
        pred5 = result.predict(offset=0 * data.offset)

        assert_allclose(pred1, pred2)
        assert_allclose(pred1, pred3)
        assert_allclose(pred1, pred4 + data.offset)
        assert_allclose(pred1, pred5 + data.offset)

        x1_new = np.random.normal(size=10)
        x2_new = np.random.normal(size=10)
        new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
        pred6 = result.predict(exog=new_exog)
        params = result.params
        pred6_correct = params[0] + params[1] * x1_new + params[2] * x2_new
        assert_allclose(pred6, pred6_correct)

    def test_stationary_grid(self):
        # Stationary covariance structure with grid=True, compared to
        # Stata.

        endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.]
        exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2]
        group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]

        exog = tools.add_constant(exog)

        cs = cov_struct.Stationary(max_lag=2, grid=True)
        model = gee.GEE(endog, exog, group, cov_struct=cs)
        result = model.fit()
        se = result.bse * np.sqrt(12 / 9.)  # Stata adjustment

        assert_allclose(cs.covariance_matrix(np.r_[1, 1, 1], 0)[0].sum(),
                        6.4633538285149452)

        # Obtained from Stata using:
        # xtgee y x, i(g) vce(robust) corr(Stationary2)
        assert_allclose(result.params, np.r_[
                        4.463968, -0.0386674], rtol=1e-5, atol=1e-5)
        assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5,
                        atol=1e-5)

    def test_stationary_nogrid(self):
        # First test special case where the data follow a grid but we
        # fit using nogrid
        endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.]
        exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2]
        time = np.r_[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
        group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]

        exog = tools.add_constant(exog)

        model = gee.GEE(endog, exog, group,
                        cov_struct=cov_struct.Stationary(max_lag=2,
                                                         grid=False))
        result = model.fit()
        se = result.bse * np.sqrt(12 / 9.)
# Stata adjustment # Obtained from Stata using: # xtgee y x, i(g) vce(robust) corr(Stationary2) assert_allclose(result.params, np.r_[ 4.463968, -0.0386674], rtol=1e-5, atol=1e-5) assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5, atol=1e-5) # Smoke test for no grid # TODO: pytest.mark.smoke> time = np.r_[0, 1, 3, 0, 2, 3, 0, 2, 3, 0, 1, 2][:, None] model = gee.GEE(endog, exog, group, time=time, cov_struct=cov_struct.Stationary(max_lag=4, grid=False)) model.fit() def test_predict_exposure(self): n = 50 np.random.seed(34234) X1 = np.random.normal(size=n) X2 = np.random.normal(size=n) groups = np.kron(np.arange(25), np.r_[1, 1]) offset = np.random.uniform(1, 2, size=n) exposure = np.random.uniform(1, 2, size=n) Y = np.random.poisson(0.1 * (X1 + X2) + offset + np.log(exposure), size=n) data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups, "offset": offset, "exposure": exposure}) fml = "Y ~ X1 + X2" model = gee.GEE.from_formula(fml, groups, data, family=families.Poisson(), offset="offset", exposure="exposure") result = model.fit() assert_equal(result.converged, True) pred1 = result.predict() pred2 = result.predict(offset=data["offset"]) pred3 = result.predict(exposure=data["exposure"]) pred4 = result.predict( offset=data["offset"], exposure=data["exposure"]) pred5 = result.predict(exog=data[-10:], offset=data["offset"][-10:], exposure=data["exposure"][-10:]) # without patsy pred6 = result.predict(exog=result.model.exog[-10:], offset=data["offset"][-10:], exposure=data["exposure"][-10:], transform=False) assert_allclose(pred1, pred2) assert_allclose(pred1, pred3) assert_allclose(pred1, pred4) assert_allclose(pred1[-10:], pred5) assert_allclose(pred1[-10:], pred6) def test_offset_formula(self): # Test various ways of passing offset and exposure to `from_formula`. 
n = 50 np.random.seed(34234) X1 = np.random.normal(size=n) X2 = np.random.normal(size=n) groups = np.kron(np.arange(25), np.r_[1, 1]) offset = np.random.uniform(1, 2, size=n) exposure = np.exp(offset) Y = np.random.poisson(0.1 * (X1 + X2) + 2 * offset, size=n) data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups, "offset": offset, "exposure": exposure}) fml = "Y ~ X1 + X2" model1 = gee.GEE.from_formula(fml, groups, data, family=families.Poisson(), offset="offset") result1 = model1.fit() assert_equal(result1.converged, True) model2 = gee.GEE.from_formula(fml, groups, data, family=families.Poisson(), offset=offset) result2 = model2.fit(start_params=result1.params) assert_allclose(result1.params, result2.params) assert_equal(result2.converged, True) model3 = gee.GEE.from_formula(fml, groups, data, family=families.Poisson(), exposure=exposure) result3 = model3.fit(start_params=result1.params) assert_allclose(result1.params, result3.params) assert_equal(result3.converged, True) model4 = gee.GEE.from_formula(fml, groups, data, family=families.Poisson(), exposure="exposure") result4 = model4.fit(start_params=result1.params) assert_allclose(result1.params, result4.params) assert_equal(result4.converged, True) model5 = gee.GEE.from_formula(fml, groups, data, family=families.Poisson(), exposure="exposure", offset="offset") result5 = model5.fit() assert_equal(result5.converged, True) model6 = gee.GEE.from_formula(fml, groups, data, family=families.Poisson(), offset=2 * offset) result6 = model6.fit(start_params=result5.params) assert_allclose(result5.params, result6.params) assert_equal(result6.converged, True) def test_sensitivity(self): va = cov_struct.Exchangeable() family = families.Gaussian() np.random.seed(34234) n = 100 Y = np.random.normal(size=n) X1 = np.random.normal(size=n) X2 = np.random.normal(size=n) groups = np.kron(np.arange(50), np.r_[1, 1]) D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2}) mod = gee.GEE.from_formula("Y ~ X1 + X2", groups, D, 
family=family, cov_struct=va) rslt = mod.fit() ps = rslt.params_sensitivity(0, 0.5, 2) assert_almost_equal(len(ps), 2) assert_almost_equal([x.cov_struct.dep_params for x in ps], [0.0, 0.5]) # Regression test assert_almost_equal([x.params[0] for x in ps], [0.1696214707458818, 0.17836097387799127]) def test_equivalence(self): """ The Equivalence covariance structure can represent an exchangeable covariance structure. Here we check that the results are identical using the two approaches. """ np.random.seed(3424) endog = np.random.normal(size=20) exog = np.random.normal(size=(20, 2)) exog[:, 0] = 1 groups = np.kron(np.arange(5), np.ones(4)) groups[12:] = 3 # Create unequal size groups # Set up an Equivalence covariance structure to mimic an # Exchangeable covariance structure. pairs = {} start = [0, 4, 8, 12] for k in range(4): pairs[k] = {} # Diagonal values (variance parameters) if k < 3: pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3], start[k] + np.r_[0, 1, 2, 3]) else: pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7], start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7]) # Off-diagonal pairs (covariance parameters) if k < 3: a, b = np.tril_indices(4, -1) pairs[k][1] = (start[k] + a, start[k] + b) else: a, b = np.tril_indices(8, -1) pairs[k][1] = (start[k] + a, start[k] + b) ex = cov_struct.Exchangeable() model1 = gee.GEE(endog, exog, groups, cov_struct=ex) result1 = model1.fit() for return_cov in False, True: ec = cov_struct.Equivalence(pairs, return_cov=return_cov) model2 = gee.GEE(endog, exog, groups, cov_struct=ec) result2 = model2.fit() # Use large atol/rtol for the correlation case since there # are some small differences in the results due to degree # of freedom differences. 
if return_cov is True: atol, rtol = 1e-6, 1e-6 else: atol, rtol = 1e-3, 1e-3 assert_allclose(result1.params, result2.params, atol=atol, rtol=rtol) assert_allclose(result1.bse, result2.bse, atol=atol, rtol=rtol) assert_allclose(result1.scale, result2.scale, atol=atol, rtol=rtol) def test_equivalence_from_pairs(self): np.random.seed(3424) endog = np.random.normal(size=50) exog = np.random.normal(size=(50, 2)) exog[:, 0] = 1 groups = np.kron(np.arange(5), np.ones(10)) groups[30:] = 3 # Create unequal size groups # Set up labels. labels = np.kron(np.arange(5), np.ones(10)).astype(np.int32) labels = labels[np.random.permutation(len(labels))] eq = cov_struct.Equivalence(labels=labels, return_cov=True) model1 = gee.GEE(endog, exog, groups, cov_struct=eq) # Call this directly instead of letting init do it to get the # result before reindexing. eq._pairs_from_labels() # Make sure the size is correct to hold every element. for g in model1.group_labels: p = eq.pairs[g] vl = [len(x[0]) for x in p.values()] m = sum(groups == g) assert_allclose(sum(vl), m * (m + 1) / 2) # Check for duplicates. ixs = set([]) for g in model1.group_labels: for v in eq.pairs[g].values(): for a, b in zip(v[0], v[1]): ky = (a, b) assert(ky not in ixs) ixs.add(ky) # Smoke test # TODO: pytest.mark.smoke? 
eq = cov_struct.Equivalence(labels=labels, return_cov=True) model1 = gee.GEE(endog, exog, groups, cov_struct=eq) with warnings.catch_warnings(): warnings.simplefilter('ignore') model1.fit(maxiter=2) class CheckConsistency(object): start_params = None def test_cov_type(self): mod = self.mod res_robust = mod.fit(start_params=self.start_params) res_naive = mod.fit(start_params=self.start_params, cov_type='naive') res_robust_bc = mod.fit(start_params=self.start_params, cov_type='bias_reduced') # call summary to make sure it does not change cov_type res_naive.summary() res_robust_bc.summary() # check cov_type assert_equal(res_robust.cov_type, 'robust') assert_equal(res_naive.cov_type, 'naive') assert_equal(res_robust_bc.cov_type, 'bias_reduced') # check bse and cov_params # we are comparing different runs of the optimization # bse in ordinal and multinomial have an atol around 5e-10 for two # consecutive calls to fit. rtol = 1e-8 for (res, cov_type, cov) in [ (res_robust, 'robust', res_robust.cov_robust), (res_naive, 'naive', res_robust.cov_naive), (res_robust_bc, 'bias_reduced', res_robust_bc.cov_robust_bc) ]: bse = np.sqrt(np.diag(cov)) assert_allclose(res.bse, bse, rtol=rtol) if cov_type != 'bias_reduced': # cov_type=naive shortcuts calculation of bias reduced # covariance for efficiency bse = res_naive.standard_errors(cov_type=cov_type) assert_allclose(res.bse, bse, rtol=rtol) assert_allclose(res.cov_params(), cov, rtol=rtol, atol=1e-10) assert_allclose(res.cov_params_default, cov, rtol=rtol, atol=1e-10) # assert that we do not have a copy assert_(res_robust.cov_params_default is res_robust.cov_robust) assert_(res_naive.cov_params_default is res_naive.cov_naive) assert_(res_robust_bc.cov_params_default is res_robust_bc.cov_robust_bc) # check exception for misspelled cov_type assert_raises(ValueError, mod.fit, cov_type='robust_bc') class TestGEEPoissonCovType(CheckConsistency): @classmethod def setup_class(cls): endog, exog, group_n = load_data("gee_poisson_1.csv") 
family = families.Poisson() vi = cov_struct.Independence() cls.mod = gee.GEE(endog, exog, group_n, None, family, vi) cls.start_params = np.array([-0.03644504, -0.05432094, 0.01566427, 0.57628591, -0.0046566, -0.47709315]) def test_wrapper(self): endog, exog, group_n = load_data("gee_poisson_1.csv", icept=False) endog = pd.Series(endog) exog = pd.DataFrame(exog) group_n = pd.Series(group_n) family = families.Poisson() vi = cov_struct.Independence() mod = gee.GEE(endog, exog, group_n, None, family, vi) rslt2 = mod.fit() check_wrapper(rslt2) class TestGEEPoissonFormulaCovType(CheckConsistency): @classmethod def setup_class(cls): endog, exog, group_n = load_data("gee_poisson_1.csv") family = families.Poisson() vi = cov_struct.Independence() # Test with formulas D = np.concatenate((endog[:, None], group_n[:, None], exog[:, 1:]), axis=1) D = pd.DataFrame(D) D.columns = ["Y", "Id", ] + ["X%d" % (k + 1) for k in range(exog.shape[1] - 1)] cls.mod = gee.GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", D, family=family, cov_struct=vi) cls.start_params = np.array([-0.03644504, -0.05432094, 0.01566427, 0.57628591, -0.0046566, -0.47709315]) class TestGEEOrdinalCovType(CheckConsistency): @classmethod def setup_class(cls): family = families.Binomial() endog, exog, groups = load_data("gee_ordinal_1.csv", icept=False) va = cov_struct.GlobalOddsRatio("ordinal") cls.mod = gee.OrdinalGEE(endog, exog, groups, None, family, va) cls.start_params = np.array([1.09250002, 0.0217443, -0.39851092, -0.01812116, 0.03023969, 1.18258516, 0.01803453, -1.10203381]) def test_wrapper(self): endog, exog, groups = load_data("gee_ordinal_1.csv", icept=False) endog = pd.Series(endog, name='yendog') exog = pd.DataFrame(exog) groups = pd.Series(groups, name='the_group') family = families.Binomial() va = cov_struct.GlobalOddsRatio("ordinal") mod = gee.OrdinalGEE(endog, exog, groups, None, family, va) rslt2 = mod.fit() check_wrapper(rslt2) class TestGEEMultinomialCovType(CheckConsistency): @classmethod 
def setup_class(cls): endog, exog, groups = load_data("gee_nominal_1.csv", icept=False) # Test with independence correlation va = cov_struct.Independence() cls.mod = gee.NominalGEE(endog, exog, groups, cov_struct=va) cls.start_params = np.array([0.44944752, 0.45569985, -0.92007064, -0.46766728]) def test_wrapper(self): endog, exog, groups = load_data("gee_nominal_1.csv", icept=False) endog = pd.Series(endog, name='yendog') exog = pd.DataFrame(exog) groups = pd.Series(groups, name='the_group') va = cov_struct.Independence() mod = gee.NominalGEE(endog, exog, groups, cov_struct=va) rslt2 = mod.fit() check_wrapper(rslt2) def test_regularized_poisson(): np.random.seed(8735) ng, gs, p = 1000, 5, 5 x = np.random.normal(size=(ng*gs, p)) r = 0.5 x[:, 2] = r*x[:, 1] + np.sqrt(1-r**2)*x[:, 2] lpr = 0.7*(x[:, 1] - x[:, 3]) mean = np.exp(lpr) y = np.random.poisson(mean) groups = np.kron(np.arange(ng), np.ones(gs)) model = gee.GEE(y, x, groups=groups, family=families.Poisson()) result = model.fit_regularized(0.0000001) assert_allclose(result.params, 0.7 * np.r_[0, 1, 0, -1, 0], rtol=0.01, atol=0.12) def test_regularized_gaussian(): # Example 1 from Wang et al. 
np.random.seed(8735) ng, gs, p = 200, 4, 200 groups = np.kron(np.arange(ng), np.ones(gs)) x = np.zeros((ng*gs, p)) x[:, 0] = 1 * (np.random.uniform(size=ng*gs) < 0.5) x[:, 1] = np.random.normal(size=ng*gs) r = 0.5 for j in range(2, p): eps = np.random.normal(size=ng*gs) x[:, j] = r * x[:, j-1] + np.sqrt(1 - r**2) * eps lpr = np.dot(x[:, 0:4], np.r_[2, 3, 1.5, 2]) s = 0.4 e = np.sqrt(s) * np.kron(np.random.normal(size=ng), np.ones(gs)) e += np.sqrt(1 - s) * np.random.normal(size=ng*gs) y = lpr + e model = gee.GEE(y, x, cov_struct=cov_struct.Exchangeable(), groups=groups) result = model.fit_regularized(0.01, maxiter=100) ex = np.zeros(200) ex[0:4] = np.r_[2, 3, 1.5, 2] assert_allclose(result.params, ex, rtol=0.01, atol=0.2) assert_allclose(model.cov_struct.dep_params, np.r_[s], rtol=0.01, atol=0.05) @pytest.mark.smoke @pytest.mark.matplotlib def test_plots(close_figures): np.random.seed(378) exog = np.random.normal(size=100) endog = np.random.normal(size=(100, 2)) groups = np.kron(np.arange(50), np.r_[1, 1]) model = gee.GEE(exog, endog, groups) result = model.fit() fig = result.plot_added_variable(1) assert_equal(isinstance(fig, plt.Figure), True) fig = result.plot_partial_residuals(1) assert_equal(isinstance(fig, plt.Figure), True) fig = result.plot_ceres_residuals(1) assert_equal(isinstance(fig, plt.Figure), True) fig = result.plot_isotropic_dependence() assert_equal(isinstance(fig, plt.Figure), True) def test_missing(): # gh-1877 data = [['id', 'al', 'status', 'fake', 'grps'], ['4A', 'A', 1, 1, 0], ['5A', 'A', 1, 2.0, 1], ['6A', 'A', 1, 3, 2], ['7A', 'A', 1, 2.0, 3], ['8A', 'A', 1, 1, 4], ['9A', 'A', 1, 2.0, 5], ['11A', 'A', 1, 1, 6], ['12A', 'A', 1, 2.0, 7], ['13A', 'A', 1, 1, 8], ['14A', 'A', 1, 1, 9], ['15A', 'A', 1, 1, 10], ['16A', 'A', 1, 2.0, 11], ['17A', 'A', 1, 3.0, 12], ['18A', 'A', 1, 3.0, 13], ['19A', 'A', 1, 2.0, 14], ['20A', 'A', 1, 2.0, 15], ['2C', 'C', 0, 3.0, 0], ['3C', 'C', 0, 1, 1], ['4C', 'C', 0, 1, 2], ['5C', 'C', 0, 2.0, 3], ['6C', 'C', 0, 1, 
4], ['9C', 'C', 0, 1, 5], ['10C', 'C', 0, 3, 6], ['12C', 'C', 0, 3, 7], ['14C', 'C', 0, 2.5, 8], ['15C', 'C', 0, 1, 9], ['17C', 'C', 0, 1, 10], ['22C', 'C', 0, 1, 11], ['23C', 'C', 0, 1, 12], ['24C', 'C', 0, 1, 13], ['32C', 'C', 0, 2.0, 14], ['35C', 'C', 0, 1, 15]] df = pd.DataFrame(data[1:], columns=data[0]) df.loc[df.fake == 1, 'fake'] = np.nan mod = gee.GEE.from_formula('status ~ fake', data=df, groups='grps', cov_struct=cov_struct.Independence(), family=families.Binomial()) df = df.dropna().copy() df['constant'] = 1 mod2 = gee.GEE(df.status, df[['constant', 'fake']], groups=df.grps, cov_struct=cov_struct.Independence(), family=families.Binomial()) assert_equal(mod.endog, mod2.endog) assert_equal(mod.exog, mod2.exog) assert_equal(mod.groups, mod2.groups) res = mod.fit() res2 = mod2.fit() assert_almost_equal(res.params.values, res2.params.values) def simple_qic_data(fam): y = np.r_[0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0] x1 = np.r_[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0] x2 = np.r_[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] g = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4] x1 = x1[:, None] x2 = x2[:, None] return y, x1, x2, g # Test quasi-likelihood by numerical integration in two settings # where there is a closed form expression. @pytest.mark.parametrize("family", [families.Gaussian, families.Poisson]) def test_ql_known(family): fam = family() y, x1, x2, g = simple_qic_data(family) model1 = gee.GEE(y, x1, family=fam, groups=g) result1 = model1.fit(ddof_scale=0) mean1 = result1.fittedvalues model2 = gee.GEE(y, x2, family=fam, groups=g) result2 = model2.fit(ddof_scale=0) mean2 = result2.fittedvalues if family is families.Gaussian: ql1 = -len(y) / 2. ql2 = -len(y) / 2. 
elif family is families.Poisson: c = np.zeros_like(y) ii = y > 0 c[ii] = y[ii] * np.log(y[ii]) - y[ii] ql1 = np.sum(y * np.log(mean1) - mean1 - c) ql2 = np.sum(y * np.log(mean2) - mean2 - c) else: raise ValueError("Unknown family") qle1 = model1.qic(result1.params, result1.scale, result1.cov_params()) qle2 = model2.qic(result2.params, result2.scale, result2.cov_params()) assert_allclose(ql1, qle1[0], rtol=1e-4) assert_allclose(ql2, qle2[0], rtol=1e-4) with warnings.catch_warnings(): warnings.simplefilter("ignore") qler1 = result1.qic() qler2 = result2.qic() assert_allclose(qler1, qle1[1:], rtol=1e-5) assert_allclose(qler2, qle2[1:], rtol=1e-5) # Compare differences of QL values computed by numerical integration. # Use difference here so that constants that are inconvenient to compute # cancel out. @pytest.mark.parametrize("family", [families.Gaussian, families.Binomial, families.Poisson]) def test_ql_diff(family): fam = family() y, x1, x2, g = simple_qic_data(family) model1 = gee.GEE(y, x1, family=fam, groups=g) result1 = model1.fit(ddof_scale=0) mean1 = result1.fittedvalues model2 = gee.GEE(y, x2, family=fam, groups=g) result2 = model2.fit(ddof_scale=0) mean2 = result2.fittedvalues if family is families.Gaussian: qldiff = 0 elif family is families.Binomial: qldiff = np.sum(y * np.log(mean1 / (1 - mean1)) + np.log(1 - mean1)) qldiff -= np.sum(y * np.log(mean2 / (1 - mean2)) + np.log(1 - mean2)) elif family is families.Poisson: qldiff = (np.sum(y * np.log(mean1) - mean1) - np.sum(y * np.log(mean2) - mean2)) else: raise ValueError("unknown family") qle1, _, _ = model1.qic(result1.params, result1.scale, result1.cov_params()) qle2, _, _ = model2.qic(result2.params, result2.scale, result2.cov_params()) assert_allclose(qle1 - qle2, qldiff, rtol=1e-5, atol=1e-5) def test_qic_warnings(): with pytest.warns(UserWarning): fam = families.Gaussian() y, x1, _, g = simple_qic_data(fam) model = gee.GEE(y, x1, family=fam, groups=g) result = model.fit() result.qic() 
@pytest.mark.parametrize("reg", [False, True])
def test_quasipoisson(reg):
    # Fit the same Poisson GEE with the default scale and with
    # scale="X2", either regularized (reg=True) or not.

    np.random.seed(343)

    nobs = 1000
    exog = np.random.normal(size=(nobs, 3))
    rate = np.random.gamma(1, 1, size=nobs)
    endog = np.random.poisson(rate)

    clusters = np.kron(np.arange(100), np.ones(nobs // 100))

    # Two independent but identically specified models.
    plain, scaled = (
        gee.GEE(endog, exog, family=families.Poisson(), groups=clusters)
        for _ in range(2))

    if not reg:
        fit_plain = plain.fit(cov_type="naive")
        fit_scaled = scaled.fit(scale="X2", cov_type="naive")
    else:
        fit_plain = plain.fit_regularized(pen_wt=0.1)
        fit_scaled = scaled.fit_regularized(pen_wt=0.1, scale="X2")

    # The parameter estimates are the same regardless of how
    # the scale parameter is handled
    assert_allclose(fit_plain.params, fit_scaled.params)

    if reg:
        return

    # The robust covariance does not depend on the scale parameter,
    # but the naive covariance does.
    cov_ratio = fit_scaled.cov_naive / fit_plain.cov_naive
    expected_ratio = fit_scaled.scale * np.ones_like(fit_scaled.cov_naive)
    assert_allclose(cov_ratio, expected_ratio)


def test_grid_ar():
    # Fit the same simulated data with Autoregressive (grid and no grid)
    # and Stationary(max_lag=1) structures and compare the estimated
    # dependence parameters.

    np.random.seed(243)

    rho, group_size, n_groups = 0.5, 10, 100
    lags = np.arange(group_size)
    chol = np.linalg.cholesky(rho**np.abs(np.subtract.outer(lags, lags)))

    # One draw per group, taken in group order.
    noise = []
    for _ in range(n_groups):
        noise.append(np.dot(chol, np.random.normal(size=group_size)))
    noise = 2 * np.concatenate(noise)

    labels = np.repeat(np.arange(n_groups), group_size)

    exog = np.random.normal(size=(n_groups * group_size, 3))
    endog = np.dot(exog, np.r_[1, -1, 0]) + noise

    structures = (cov_struct.Autoregressive(grid=False),
                  cov_struct.Autoregressive(grid=True),
                  cov_struct.Stationary(max_lag=1, grid=False))
    fits = [gee.GEE(endog, exog, groups=labels, cov_struct=cs).fit()
            for cs in structures]
    ar_nogrid, ar_grid, stationary = (
        fit.cov_struct.dep_params for fit in fits)

    assert_allclose(ar_nogrid, ar_grid, rtol=0.05)
    assert_allclose(ar_nogrid, stationary[1], rtol=0.05)


def test_unstructured_complete():
    # Unstructured covariance with all three time points observed in
    # every group; params and dep_params are compared with the values
    # used to simulate.

    np.random.seed(43)
    n_groups = 400
    true_cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]])
    chol = np.linalg.cholesky(true_cov)
    noise = np.dot(np.random.normal(size=(n_groups, 3)), chol.T)
    exog = np.random.normal(size=(3 * n_groups, 3))
    true_params = np.r_[1, -2, 0.1]
    endog = np.dot(exog, true_params) + noise.ravel()

    labels = np.kron(np.arange(n_groups), np.ones(3))
    times = np.kron(np.ones(n_groups), np.r_[0, 1, 2]).astype(int)

    model = gee.GEE(endog, exog, time=times,
                    cov_struct=cov_struct.Unstructured(), groups=labels)
    fit = model.fit()

    assert_allclose(fit.params, true_params, rtol=0.05, atol=0.5)
    assert_allclose(model.cov_struct.dep_params, true_cov,
                    rtol=0.05, atol=0.5)


def test_unstructured_incomplete():
    # Unstructured covariance when each group is missing one of its
    # three time points.

    np.random.seed(43)
    n_groups = 400
    true_cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]])
    chol = np.linalg.cholesky(true_cov)
    noise = np.dot(np.random.normal(size=(n_groups, 3)), chol.T)
    full_exog = np.random.normal(size=(3 * n_groups, 3))
    true_params = np.r_[1, -2, 0.1]
    full_mean = np.dot(full_exog, true_params)

    endog_parts, exog_parts, time_parts, label_parts = [], [], [], []
    for i in range(n_groups):
        # Omit one observation from each group of 3
        kept = np.asarray([j for j in (0, 1, 2) if j != i % 3])
        rows = 3 * i + kept
        time_parts.append(kept)
        endog_parts.append(full_mean[rows] + noise[i, kept])
        exog_parts.append(full_exog[rows, :])
        label_parts.append(i * np.ones(2))

    endog = np.concatenate(endog_parts)
    exog = np.concatenate(exog_parts, axis=0)
    times = np.asarray(np.concatenate(time_parts), dtype=int)
    labels = np.concatenate(label_parts)

    model = gee.GEE(endog, exog, time=times[:, None],
                    cov_struct=cov_struct.Unstructured(), groups=labels)
    fit = model.fit()

    assert_allclose(fit.params, true_params, rtol=0.05, atol=0.5)
    assert_allclose(model.cov_struct.dep_params, true_cov,
                    rtol=0.05, atol=0.5)


def _diag_scaled_solve(sd, mat, rhs):
    # Reference value for the covsolve tests: solve with diag(sd), then
    # with mat, then with diag(sd) again, using dense linear algebra.
    scale = np.diag(sd)
    inner = np.linalg.solve(scale, rhs)
    middle = np.linalg.solve(mat, inner)
    return np.linalg.solve(scale, middle)


def test_ar_covsolve():
    # Autoregressive.covariance_matrix_solve against a dense solve.

    np.random.seed(123)

    struct = cov_struct.Autoregressive(grid=True)
    struct.dep_params = 0.4

    for dim, ncol in [(d, q) for d in (1, 2, 4) for q in (1, 4)]:

        pos = np.arange(dim)
        corr = 0.4 ** np.abs(np.subtract.outer(pos, pos))
        sd = np.random.uniform(size=dim)

        # Vector right-hand side when ncol == 1, matrix otherwise.
        rhs = np.random.normal(size=dim if ncol == 1 else (dim, ncol))

        expected = _diag_scaled_solve(sd, corr, rhs)
        solved = struct.covariance_matrix_solve(
            np.zeros_like(sd), np.zeros_like(sd), sd, [rhs])

        assert_allclose(expected, solved[0], rtol=1e-5, atol=1e-5)


def test_ex_covsolve():
    # Exchangeable.covariance_matrix_solve against a dense solve.

    np.random.seed(123)

    struct = cov_struct.Exchangeable()
    struct.dep_params = 0.4

    for dim, ncol in [(d, q) for d in (1, 2, 4) for q in (1, 4)]:

        corr = 0.4 * np.ones((dim, dim)) + 0.6 * np.eye(dim)
        sd = np.random.uniform(size=dim)

        # Vector right-hand side when ncol == 1, matrix otherwise.
        rhs = np.random.normal(size=dim if ncol == 1 else (dim, ncol))

        expected = _diag_scaled_solve(sd, corr, rhs)
        solved = struct.covariance_matrix_solve(
            np.zeros_like(sd), np.arange(dim, dtype=int), sd, [rhs])

        assert_allclose(expected, solved[0], rtol=1e-5, atol=1e-5)


def test_stationary_covsolve():
    # Stationary.covariance_matrix_solve against a dense solve that uses
    # the matrix returned by covariance_matrix.

    np.random.seed(123)

    struct = cov_struct.Stationary(grid=True)
    struct.time = np.arange(10, dtype=int)

    for dim, ncol in [(d, q) for d in (1, 2, 4) for q in (1, 4)]:

        struct.dep_params = (2.0 ** (-np.arange(dim)))
        struct.max_lag = dim - 1
        corr, _ = struct.covariance_matrix(np.zeros(dim),
                                           np.arange(dim, dtype=int))
        sd = np.random.uniform(size=dim)

        # Vector right-hand side when ncol == 1, matrix otherwise.
        rhs = np.random.normal(size=dim if ncol == 1 else (dim, ncol))

        expected = _diag_scaled_solve(sd, corr, rhs)
        solved = struct.covariance_matrix_solve(
            np.zeros_like(sd), np.arange(dim, dtype=int), sd, [rhs])

        assert_allclose(expected, solved[0], rtol=1e-5, atol=1e-5)