""" Assesment of Generalized Estimating Equations using simulation. This script checks Poisson models. See the generated file "gee_poisson_simulation_check.txt" for results. """ import numpy as np from statsmodels.genmod.families import Poisson from .gee_gaussian_simulation_check import GEE_simulator from statsmodels.genmod.generalized_estimating_equations import GEE from statsmodels.genmod.cov_struct import Exchangeable,Independence class Exchangeable_simulator(GEE_simulator): """ Simulate exchangeable Poisson data. The data within a cluster are simulated as y_i = z_c + z_i. The z_c, and {z_i} are independent Poisson random variables with expected values e_c and {e_i}, respectively. In order for the pairwise correlation to be equal to `f` for all pairs, we need e_c / sqrt((e_c + e_i) * (e_c + e_j)) = f for all i, j. By setting all e_i = e within a cluster, these equations can be satisfied. We thus need e_c * (1 - f) = f * e, which can be solved (non-uniquely) for e and e_c. """ scale_inv = 1. def print_dparams(self, dparams_est): OUT.write("Estimated common pairwise correlation: %8.4f\n" % dparams_est[0]) OUT.write("True common pairwise correlation: %8.4f\n" % self.dparams[0]) OUT.write("Estimated inverse scale parameter: %8.4f\n" % dparams_est[1]) OUT.write("True inverse scale parameter: %8.4f\n" % self.scale_inv) OUT.write("\n") def simulate(self): endog, exog, group, time = [], [], [], [] # Get a basis for the orthogonal complement to params. f = np.sum(self.params**2) u,s,vt = np.linalg.svd(np.eye(len(self.params)) - np.outer(self.params, self.params) / f) params0 = u[:,np.flatnonzero(s > 1e-6)] for i in range(self.ngroups): gsize = np.random.randint(self.group_size_range[0], self.group_size_range[1]) group.append([i,] * gsize) time1 = np.random.normal(size=(gsize, 2)) time.append(time1) e_c = np.random.uniform(low=1, high=10) e = e_c * (1 - self.dparams[0]) / self.dparams[0] common = np.random.poisson(e_c) unique = np.random.poisson(e, gsize) endog1 = common + unique endog.append(endog1) lpr = np.log(e_c + e) * np.ones(gsize) # Create an exog matrix so that E[Y] = log(dot(exog1, params)) exog1 = np.outer(lpr, self.params) / np.sum(self.params**2) emat = np.random.normal(size=(len(lpr), params0.shape[1])) exog1 += np.dot(emat, params0.T) exog.append(exog1) self.exog = np.concatenate(exog, axis=0) self.endog = np.concatenate(endog) self.time = np.concatenate(time, axis=0) self.group = np.concatenate(group) class Overdispersed_simulator(GEE_simulator): """ Use the negative binomial distribution to check GEE estimation using the overdispered Poisson model with independent dependence. Simulating X = np.random.negative_binomial(n, p, size) then EX = (1 - p) * n / p Var(X) = (1 - p) * n / p**2 These equations can be inverted as follows: p = E / V n = E * p / (1 - p) dparams[0] is the common correlation coefficient """ def print_dparams(self, dparams_est): OUT.write("Estimated inverse scale parameter: %8.4f\n" % dparams_est[0]) OUT.write("True inverse scale parameter: %8.4f\n" % self.scale_inv) OUT.write("\n") def simulate(self): endog, exog, group, time = [], [], [], [] # Get a basis for the orthogonal complement to params. 

class Overdispersed_simulator(GEE_simulator):
    """
    Use the negative binomial distribution to check GEE estimation of
    the overdispersed Poisson model with the independence dependence
    structure.

    If we simulate X = np.random.negative_binomial(n, p, size), then

        E[X] = (1 - p) * n / p
        Var(X) = (1 - p) * n / p**2

    These equations can be inverted as follows:

        p = E / V
        n = E * p / (1 - p)

    A small sanity check of this parameterization appears at the end
    of this module.
    """

    def print_dparams(self, dparams_est):
        OUT.write("Estimated inverse scale parameter: %8.4f\n" %
                  dparams_est[0])
        OUT.write("True inverse scale parameter: %8.4f\n" %
                  self.scale_inv)
        OUT.write("\n")

    def simulate(self):

        endog, exog, group, time = [], [], [], []

        # Get a basis for the orthogonal complement to params.
        f = np.sum(self.params**2)
        u, s, vt = np.linalg.svd(np.eye(len(self.params)) -
                                 np.outer(self.params, self.params) / f)
        params0 = u[:, np.flatnonzero(s > 1e-6)]

        for i in range(self.ngroups):

            gsize = np.random.randint(self.group_size_range[0],
                                      self.group_size_range[1])

            group.append([i,] * gsize)

            time1 = np.random.normal(size=(gsize, 2))
            time.append(time1)

            exog1 = np.random.normal(size=(gsize, len(self.params)))
            exog.append(exog1)

            E = np.exp(np.dot(exog1, self.params))
            V = E * self.scale_inv

            p = E / V
            n = E * p / (1 - p)

            endog1 = np.random.negative_binomial(n, p, gsize)
            endog.append(endog1)

        self.exog = np.concatenate(exog, axis=0)
        self.endog = np.concatenate(endog)
        self.time = np.concatenate(time, axis=0)
        self.group = np.concatenate(group)


def gendat_exchangeable():
    exs = Exchangeable_simulator()
    exs.params = np.r_[2., 0.2, 0.2, -0.1, -0.2]
    exs.ngroups = 200
    exs.dparams = [0.3,]
    exs.simulate()
    return exs, Exchangeable()


def gendat_overdispersed():
    exs = Overdispersed_simulator()
    exs.params = np.r_[2., 0.2, 0.2, -0.1, -0.2]
    exs.ngroups = 200
    exs.scale_inv = 2.
    exs.dparams = []
    exs.simulate()
    return exs, Independence()


if __name__ == "__main__":

    np.set_printoptions(formatter={'all': lambda x: "%8.3f" % x},
                        suppress=True)

    OUT = open("gee_poisson_simulation_check.txt", "w")

    nrep = 100

    gendats = [gendat_exchangeable, gendat_overdispersed]

    lhs = np.array([[0., 1, -1, 0, 0],])
    rhs = np.r_[0.0,]

    # Loop over data generating models
    for gendat in gendats:

        pvalues = []
        params = []
        std_errors = []
        dparams = []

        for j in range(nrep):

            da, va = gendat()
            ga = Poisson()

            # Poisson seems to be more sensitive to starting values,
            # so we run the independence model first.
            md = GEE(da.endog, da.exog, da.group, da.time, ga,
                     Independence())
            mdf = md.fit()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
            mdf = md.fit(start_params=mdf.params)
            if mdf is None or (not mdf.converged):
                print("Failed to converge")
                continue

            scale_inv = 1. / md.estimate_scale()
            dparams.append(np.r_[va.dparams, scale_inv])
            params.append(np.asarray(mdf.params))
            std_errors.append(np.asarray(mdf.standard_errors))

            da, va = gendat()
            ga = Poisson()

            md = GEE(da.endog, da.exog, da.group, da.time, ga, va,
                     constraint=(lhs, rhs))
            mdf = md.fit()
            if mdf is None or (not mdf.converged):
                print("Failed to converge")
                continue

            score = md.score_test_results
            pvalue = score["p-value"]
            pvalues.append(pvalue)

        dparams_mean = np.array(sum(dparams) / len(dparams))

        OUT.write("Results based on %d successful fits out of %d data sets.\n\n"
                  % (len(dparams), nrep))

        OUT.write("Checking dependence parameters:\n")
        da.print_dparams(dparams_mean)

        params = np.array(params)
        eparams = params.mean(0)
        sdparams = params.std(0)
        std_errors = np.array(std_errors)
        std_errors = std_errors.mean(0)

        OUT.write("Checking parameter values:\n")
        OUT.write("Observed:            ")
        OUT.write(np.array_str(eparams) + "\n")
        OUT.write("Expected:            ")
        OUT.write(np.array_str(da.params) + "\n")
        OUT.write("Absolute difference: ")
        OUT.write(np.array_str(eparams - da.params) + "\n")
        OUT.write("Relative difference: ")
        OUT.write(np.array_str((eparams - da.params) / da.params) + "\n")
        OUT.write("\n")

        OUT.write("Checking standard errors\n")
        OUT.write("Observed:            ")
        OUT.write(np.array_str(sdparams) + "\n")
        OUT.write("Expected:            ")
        OUT.write(np.array_str(std_errors) + "\n")
        OUT.write("Absolute difference: ")
        OUT.write(np.array_str(sdparams - std_errors) + "\n")
        OUT.write("Relative difference: ")
        OUT.write(np.array_str((sdparams - std_errors) / std_errors) + "\n")
        OUT.write("\n")

        pvalues.sort()
        OUT.write("Checking constrained estimation:\n")
        OUT.write("Left hand side:\n")
        OUT.write(np.array_str(lhs) + "\n")
        OUT.write("Right hand side:\n")
        OUT.write(np.array_str(rhs) + "\n")
        OUT.write("Observed p-values   Expected Null p-values\n")
        for q in np.arange(0.1, 0.91, 0.1):
            OUT.write("%20.3f %20.3f\n" %
                      (pvalues[int(q * len(pvalues))], q))

        OUT.write("=" * 80 + "\n\n")

    OUT.close()
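
# The helper below is not part of the original check; it is a minimal
# standalone sketch (the function name, sample size and seed are arbitrary
# choices) verifying the negative binomial parameterization described in
# the Overdispersed_simulator docstring: drawing with p = E / V and
# n = E * p / (1 - p) should give a sample mean near E and a sample
# variance near V.  Call it manually, e.g.
# _check_negative_binomial_moments() with the defaults should return
# values near (5, 5, 10, 10).
def _check_negative_binomial_moments(E=5.0, scale_inv=2.0, nrep=200000,
                                     seed=0):
    rng = np.random.RandomState(seed)
    V = E * scale_inv
    p = E / V
    n = E * p / (1 - p)
    x = rng.negative_binomial(n, p, nrep)
    # Returns (sample mean, target mean, sample variance, target variance).
    return x.mean(), E, x.var(), V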