import numpy as np from scipy.stats.distributions import norm def generate_logistic(): # Number of clusters nclust = 100 # Regression coefficients beta = np.array([1, -2, 1], dtype=np.float64) # Covariate correlations r = 0.4 # Cluster effects of covariates rx = 0.5 # Within-cluster outcome dependence re = 0.3 p = len(beta) OUT = open("gee_logistic_1.csv", "w") for i in range(nclust): n = np.random.randint(3, 6) # Cluster size x = np.random.normal(size=(n, p)) x = rx*np.random.normal() + np.sqrt(1-rx**2)*x x[:, 2] = r*x[:, 1] + np.sqrt(1-r**2)*x[:, 2] pr = 1/(1+np.exp(-np.dot(x, beta))) z = re*np.random.normal() +\ np.sqrt(1-re**2)*np.random.normal(size=n) u = norm.cdf(z) y = 1*(u < pr) for j in range(n): OUT.write("%d, %d," % (i, y[j])) OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n") OUT.close() def generate_linear(): # Number of clusters nclust = 100 # Regression coefficients beta = np.array([1, -2, 1], dtype=np.float64) # Within cluster covariate correlations r = 0.4 # Between cluster covariate effects rx = 0.5 # Within-cluster outcome dependence # re = 0.3 p = len(beta) OUT = open("gee_linear_1.csv", "w") for i in range(nclust): n = np.random.randint(3, 6) # Cluster size x = np.random.normal(size=(n, p)) x = rx*np.random.normal() + np.sqrt(1-rx**2)*x x[:, 2] = r*x[:, 1] + np.sqrt(1-r**2)*x[:, 2] # TODO: should `e` be used somewhere? # e = np.sqrt(1-re**2)*np.random.normal(size=n) + re*np.random.normal() y = np.dot(x, beta) + np.random.normal(size=n) for j in range(n): OUT.write("%d, %d," % (i, y[j])) OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n") OUT.close() def generate_nested_linear(): # Number of clusters (clusters have 10 values, partitioned into 2 # subclusters of size 5). nclust = 200 # Regression coefficients beta = np.array([1, -2, 1], dtype=np.float64) # Top level cluster variance component v1 = 1 # Subcluster variance component v2 = 0.5 # Error variance component v3 = 1.5 p = len(beta) OUT = open("gee_nested_linear_1.csv", "w") for i in range(nclust): x = np.random.normal(size=(10, p)) y = np.dot(x, beta) y += np.sqrt(v1)*np.random.normal() y[0:5] += np.sqrt(v2)*np.random.normal() y[5:10] += np.sqrt(v2)*np.random.normal() y += np.sqrt(v3)*np.random.normal(size=10) for j in range(10): OUT.write("%d, %.3f," % (i, y[j])) OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n") OUT.close() def generate_ordinal(): # Regression coefficients beta = np.zeros(5, dtype=np.float64) beta[2] = 1 beta[4] = -1 rz = 0.5 OUT = open("gee_ordinal_1.csv", "w") for i in range(200): n = np.random.randint(3, 6) # Cluster size x = np.random.normal(size=(n, 5)) for j in range(5): x[:, j] += np.random.normal() pr = np.dot(x, beta) pr = np.array([1, 0, -0.5]) + pr[:, None] pr = 1 / (1 + np.exp(-pr)) z = rz*np.random.normal() +\ np.sqrt(1-rz**2)*np.random.normal(size=n) u = norm.cdf(z) y = (u[:, None] > pr).sum(1) for j in range(n): OUT.write("%d, %d," % (i, y[j])) OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n") OUT.close() def generate_nominal(): # Regression coefficients beta1 = np.r_[0.5, 0.5] beta2 = np.r_[-1, -0.5] p = len(beta1) rz = 0.5 OUT = open("gee_nominal_1.csv", "w") for i in range(200): n = np.random.randint(3, 6) # Cluster size x = np.random.normal(size=(n, p)) x[:, 0] = 1 for j in range(1, x.shape[1]): x[:, j] += np.random.normal() pr1 = np.exp(np.dot(x, beta1))[:, None] pr2 = np.exp(np.dot(x, beta2))[:, None] den = 1 + pr1 + pr2 pr = np.hstack((pr1/den, pr2/den, 1/den)) cpr = np.cumsum(pr, 1) z = rz*np.random.normal() +\ np.sqrt(1-rz**2)*np.random.normal(size=n) u = norm.cdf(z) y = (u[:, None] > cpr).sum(1) for j in range(n): OUT.write("%d, %d," % (i, y[j])) OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n") OUT.close() def generate_poisson(): # Regression coefficients beta = np.zeros(5, dtype=np.float64) beta[2] = 0.5 beta[4] = -0.5 nclust = 100 OUT = open("gee_poisson_1.csv", "w") for i in range(nclust): n = np.random.randint(3, 6) # Cluster size x = np.random.normal(size=(n, 5)) for j in range(5): x[:, j] += np.random.normal() lp = np.dot(x, beta) E = np.exp(lp) y = [np.random.poisson(e) for e in E] y = np.array(y) for j in range(n): OUT.write("%d, %d," % (i, y[j])) OUT.write(",".join(["%.3f" % b for b in x[j, :]]) + "\n") OUT.close()