import numpy as np """ Generate data sets for testing Cox proportional hazards regression models. After updating the test data sets, use R to run the survival.R script to update the R results. """ # The current data may not reflect this seed np.random.seed(5234) # Loop over pairs containing (sample size, number of variables). for (n, p) in (20, 1), (50, 1), (50, 2), (100, 5), (1000, 10): exog = np.random.normal(size=(5*n, p)) coef = np.linspace(-0.5, 0.5, p) lpred = np.dot(exog, coef) expected_survival_time = np.exp(-lpred) # Survival times are exponential survival_time = -np.log(np.random.uniform(size=5*n)) survival_time *= expected_survival_time # Set this to get a reasonable amount of censoring expected_censoring_time = np.mean(expected_survival_time) # Theses are the observation times. censoring_time = -np.log(np.random.uniform(size=5*n)) censoring_time *= expected_censoring_time # Entry times entry_time = -np.log(np.random.uniform(size=5*n)) entry_time *= 0.5*expected_censoring_time # 1=failure (death), 0=no failure (no death) status = 1*(survival_time <= censoring_time) # The censoring time of the failure time, whichever comes first time = np.where(status == 1, survival_time, censoring_time) # Round time so that we have ties time = np.around(time, decimals=1) # Only take cases where the entry time is before the failure or # censoring time. Take exactly n such cases. ii = np.flatnonzero(entry_time < time) ii = ii[np.random.permutation(len(ii))[0:n]] status = status[ii] time = time[ii] exog = exog[ii, :] entry_time = entry_time[ii] data = np.concatenate((time[:, None], status[:, None], entry_time[:, None], exog), axis=1) fname = "results/survival_data_%d_%d.csv" % (n, p) np.savetxt(fname, data, fmt="%.5f")