# -*- coding: utf-8 -*-
"""
Created on Mon May 05 17:29:56 2014

Author: Josef Perktold
"""

import os

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_allclose

from statsmodels.regression.linear_model import OLS, GLS
from statsmodels.sandbox.regression.penalized import TheilGLS


class TestTheilTextile(object):

    @classmethod
    def setup_class(cls):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        filepath = os.path.join(cur_dir, "results",
                                "theil_textile_predict.csv")
        cls.res_predict = pd.read_csv(filepath, sep=",")

        names = "year lconsump lincome lprice".split()

        data = np.array('''\
1923 1.99651 1.98543 2.00432
1924 1.99564 1.99167 2.00043
1925 2 2 2
1926 2.04766 2.02078 1.95713
1927 2.08707 2.02078 1.93702
1928 2.07041 2.03941 1.95279
1929 2.08314 2.04454 1.95713
1930 2.13354 2.05038 1.91803
1931 2.18808 2.03862 1.84572
1932 2.18639 2.02243 1.81558
1933 2.20003 2.00732 1.78746
1934 2.14799 1.97955 1.79588
1935 2.13418 1.98408 1.80346
1936 2.22531 1.98945 1.72099
1937 2.18837 2.0103 1.77597
1938 2.17319 2.00689 1.77452
1939 2.2188 2.0162 1.78746'''.split(), float).reshape(-1, 4)

        endog = data[:, 1]
        # constant at the end to match Stata
        exog = np.column_stack((data[:, 2:], np.ones(endog.shape[0])))

        #prior(lprice -0.7 0.15 lincome 1 0.15) cov(lprice lincome -0.01)
        r_matrix = np.array([[1, 0, 0], [0, 1, 0]])
        r_mean = [1, -0.7]

        cov_r = np.array([[0.15**2, -0.01], [-0.01, 0.15**2]])
        mod = TheilGLS(endog, exog, r_matrix, q_matrix=r_mean,
                       sigma_prior=cov_r)
        cls.res1 = mod.fit(cov_type='data-prior', use_t=True)
        #cls.res1._cache['scale'] = 0.0001852252884817586   # from tg_mixed
        cls.res1._cache['scale'] = 0.00018334123641580062  # from OLS

        from .results import results_theil_textile as resmodule
        cls.res2 = resmodule.results_theil_textile

    def test_basic(self):
        pt = self.res2.params_table[:, :6].T
        params2, bse2, tvalues2, pvalues2, ci_low, ci_upp = pt
        assert_allclose(self.res1.params, params2, rtol=2e-6)

        #TODO tgmixed seems to use scale from initial OLS, not from final res
        # np.sqrt(res.scale / res_ols.scale)
        # see below mse_resid which is equal to scale
        corr_fact = 0.9836026210570028
        corr_fact = 0.97376865041463734
        corr_fact = 1
        assert_allclose(self.res1.bse / corr_fact, bse2, rtol=2e-6)
        assert_allclose(self.res1.tvalues * corr_fact, tvalues2, rtol=2e-6)

        # pvalues are very small
        #assert_allclose(self.res1.pvalues, pvalues2, atol=2e-6)
        #assert_allclose(self.res1.pvalues, pvalues2, rtol=0.7)

        ci = self.res1.conf_int()
        # not scale corrected
        assert_allclose(ci[:, 0], ci_low, rtol=0.01)
        assert_allclose(ci[:, 1], ci_upp, rtol=0.01)

        assert_allclose(self.res1.rsquared, self.res2.r2, rtol=2e-6)

        # Note: tgmixed is using k_exog for df_resid
        corr_fact = self.res1.df_resid / self.res2.df_r
        assert_allclose(np.sqrt(self.res1.mse_resid * corr_fact),
                        self.res2.rmse, rtol=2e-6)

        assert_allclose(self.res1.fittedvalues,
                        self.res_predict['fittedvalues'], atol=5e7)

    def test_other(self):
        tc = self.res1.test_compatibility()
        assert_allclose(np.squeeze(tc[0]), self.res2.compat, rtol=2e-6)
        assert_allclose(np.squeeze(tc[1]), self.res2.pvalue, rtol=2e-6)

        frac = self.res1.share_data()
        # TODO check again, I guess tgmixed uses final scale in hatmatrix
        # but I'm not sure, it passed in previous version, but now we override
        # scale with OLS scale
        # assert_allclose(frac, self.res2.frac_sample, rtol=2e-6)
        # regression tests:
        assert_allclose(frac, 0.6946116246864239, rtol=2e-6)

    def test_no_penalization(self):
        res_ols = OLS(self.res1.model.endog, self.res1.model.exog).fit()
        res_theil = self.res1.model.fit(pen_weight=0,
                                        cov_type='data-prior')
        assert_allclose(res_theil.params, res_ols.params, rtol=1e-10)
        assert_allclose(res_theil.bse, res_ols.bse, rtol=1e-10)

    @pytest.mark.smoke
    def test_summary(self):
        with pytest.warns(UserWarning):
            self.res1.summary()


class CheckEquivalenceMixin(object):

    tol = {'default': (1e-4, 1e-20)}

    @classmethod
    def get_sample(cls):
        np.random.seed(987456)
        nobs, k_vars = 200, 5
        beta = 0.5 * np.array([0.1, 1, 1, 0, 0])
        x = np.random.randn(nobs, k_vars)
        x[:, 0] = 1
        y = np.dot(x, beta) + 2 * np.random.randn(nobs)
        return y, x

    def test_attributes(self):
        attributes_fit = ['params', 'rsquared', 'df_resid', 'df_model',
                          'llf', 'aic', 'bic'
                          #'fittedvalues', 'resid'
                          ]
        attributes_inference = ['bse', 'tvalues', 'pvalues']
        import copy
        attributes = copy.copy(attributes_fit)
        if not getattr(self, 'skip_inference', False):
            attributes.extend(attributes_inference)

        for att in attributes:
            r1 = getattr(self.res1, att)
            r2 = getattr(self.res2, att)
            if not np.size(r1) == 1:
                r1 = r1[:len(r2)]

            # check if we have overwritten tolerance
            rtol, atol = self.tol.get(att, self.tol['default'])
            message = 'attribute: ' + att  #+ '\n%r\n\%r' % (r1, r2)
            assert_allclose(r1, r2, rtol=rtol, atol=atol, err_msg=message)

        # models are not close enough for some attributes at high precision
        assert_allclose(self.res1.fittedvalues, self.res1.fittedvalues,
                        rtol=1e-3, atol=1e-4)
        assert_allclose(self.res1.resid, self.res1.resid,
                        rtol=1e-3, atol=1e-4)


class TestTheil1(CheckEquivalenceMixin):
    # penalize last two parameters to zero

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x[:, :3]).fit()


class TestTheil2(CheckEquivalenceMixin):
    # no penalization = same as OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(0)
        cls.res2 = OLS(y, x).fit()


class TestTheil3(CheckEquivalenceMixin):
    # perfect multicollinearity = same as OLS in terms of fit
    # inference: bse, ...
    # is different

    @classmethod
    def setup_class(cls):
        cls.skip_inference = True
        y, x = cls.get_sample()
        xd = np.column_stack((x, x))
        #sp = np.zeros(5), np.ones(5)
        r_matrix = np.eye(5, 10, 5)
        mod1 = TheilGLS(y, xd, r_matrix=r_matrix)  #sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(0.001, cov_type='data-prior')
        cls.res2 = OLS(y, x).fit()


class TestTheilGLS(CheckEquivalenceMixin):
    # penalize last two parameters to zero

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        nobs = len(y)
        weights = (np.arange(nobs) < (nobs // 2)) + 0.5
        mod1 = TheilGLS(y, x, sigma=weights, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = GLS(y, x[:, :3], sigma=weights).fit()


class TestTheilLinRestriction(CheckEquivalenceMixin):
    # impose linear restriction with small uncertainty - close to OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
        #mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision, careful: cls.tol is mutable
        tol = {'pvalues': (1e-4, 2e-7),
               'tvalues': (5e-4, 0)}
        tol.update(cls.tol)
        cls.tol = tol


class TestTheilLinRestrictionApprox(CheckEquivalenceMixin):
    # impose linear restriction with some uncertainty

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
        #mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(100)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision, careful: cls.tol is mutable
        import copy
        tol = copy.copy(cls.tol)
        tol2 = {'default': (0.15, 0),
                'params': (0.05, 0),
                'pvalues': (0.02, 0.001),
                }
        tol.update(tol2)
        cls.tol = tol


class TestTheilPanel(object):

    @classmethod
    def setup_class(cls):
        #example 3
        nobs = 300
        nobs_i = 5
        n_groups = nobs // nobs_i
        k_vars = 3

        from statsmodels.sandbox.panel.random_panel import PanelSample
        dgp = PanelSample(nobs, k_vars, n_groups, seed=303305)
        # add random intercept, using same RandomState
        dgp.group_means = 2 + dgp.random_state.randn(n_groups)
        print('seed', dgp.seed)
        y = dgp.generate_panel()
        x = np.column_stack((dgp.exog[:, 1:],
                             dgp.groups[:, None] == np.arange(n_groups)))
        cls.dgp = dgp
        cls.endog = y
        cls.exog = x
        cls.res_ols = OLS(y, x).fit()

    def test_regression(self):
        y = self.endog
        x = self.exog
        n_groups, k_vars = self.dgp.n_groups, self.dgp.k_vars

        Rg = (np.eye(n_groups-1) - 1.
              / n_groups * np.ones((n_groups - 1, n_groups-1)))
        R = np.c_[np.zeros((n_groups - 1, k_vars)), Rg]
        r = np.zeros(n_groups - 1)
        R[:, k_vars-1] = -1

        lambd = 1  #1e-4
        mod = TheilGLS(y, x, r_matrix=R, q_matrix=r, sigma_prior=lambd)
        res = mod.fit()

        # regression test
        params1 = np.array([
            0.9751655, 1.05215277, 0.37135028, 2.0492626, 2.82062503,
            2.82139775, 1.92940468, 2.96942081, 2.86349583, 3.20695368,
            4.04516422, 3.04918839, 4.54748808, 3.49026961, 3.15529618,
            4.25552932, 2.65471759, 3.62328747, 3.07283053, 3.49485898,
            3.42301424, 2.94677593, 2.81549427, 2.24895113, 2.29222784,
            2.89194946, 3.17052308, 2.37754241, 3.54358533, 3.79838425,
            1.91189071, 1.15976407, 4.05629691, 1.58556827, 4.49941666,
            4.08608599, 3.1889269, 2.86203652, 3.06785013, 1.9376162,
            2.90657681, 3.71910592, 3.15607617, 3.58464547, 2.15466323,
            4.87026717, 2.92909833, 2.64998337, 2.891171, 4.04422964,
            3.54616122, 4.12135273, 3.70232028, 3.8314497, 2.2591451,
            2.39321422, 3.13064532, 2.1569678, 2.04667506, 3.92064689,
            3.66243644, 3.11742725])
        assert_allclose(res.params, params1)

        pen_weight_aicc = mod.select_pen_weight(method='aicc')
        pen_weight_gcv = mod.select_pen_weight(method='gcv')
        pen_weight_cv = mod.select_pen_weight(method='cv')
        pen_weight_bic = mod.select_pen_weight(method='bic')
        assert_allclose(pen_weight_gcv, pen_weight_aicc, rtol=0.1)
        # regression tests:
        assert_allclose(pen_weight_aicc, 4.77333984, rtol=1e-4)
        assert_allclose(pen_weight_gcv, 4.45546875, rtol=1e-4)
        assert_allclose(pen_weight_bic, 9.35957031, rtol=1e-4)
        assert_allclose(pen_weight_cv, 1.99277344, rtol=1e-4)

    def test_combine_subset_regression(self):
        # split sample into two, use first sample as prior for second
        endog = self.endog
        exog = self.exog
        nobs = len(endog)

        n05 = nobs // 2
        np.random.seed(987125)
        # shuffle to get random subsamples
        shuffle_idx = np.random.permutation(np.arange(nobs))
        ys = endog[shuffle_idx]
        xs = exog[shuffle_idx]
        k = 10
        res_ols0 = OLS(ys[:n05], xs[:n05, :k]).fit()
        res_ols1 = OLS(ys[n05:], xs[n05:, :k]).fit()

        w = res_ols1.scale / res_ols0.scale  #1.01
        mod_1 = TheilGLS(ys[n05:], xs[n05:, :k], r_matrix=np.eye(k),
                         q_matrix=res_ols0.params,
                         sigma_prior=w * res_ols0.cov_params())
        res_1p = mod_1.fit(cov_type='data-prior')
        res_1s = mod_1.fit(cov_type='sandwich')
        res_olsf = OLS(ys, xs[:, :k]).fit()

        assert_allclose(res_1p.params, res_olsf.params, rtol=1e-9)
        corr_fact = np.sqrt(res_1p.scale / res_olsf.scale)
        # correct for differences in scale computation
        assert_allclose(res_1p.bse, res_olsf.bse * corr_fact, rtol=1e-3)

        # regression test, does not verify numbers;
        # especially unclear why these are smaller than OLS on the full sample
        # (in a larger sample, nobs=600, they were close to the full-sample OLS)
        bse1 = np.array([0.26589869, 0.15224812, 0.38407399, 0.75679949,
                         0.66084200, 0.54174080, 0.53697607, 0.66006377,
                         0.38228551, 0.53920485])
        assert_allclose(res_1s.bse, bse1, rtol=1e-7)