""" Test functions for models.regression """ # TODO: Test for LM from statsmodels.compat.python import lrange import warnings import numpy as np from numpy.testing import ( assert_, assert_allclose, assert_almost_equal, assert_equal, assert_raises, ) import pandas as pd import pytest from scipy.linalg import toeplitz from scipy.stats import t as student_t from statsmodels.datasets import longley from statsmodels.regression.linear_model import ( GLS, OLS, WLS, burg, yule_walker, ) from statsmodels.tools.tools import add_constant DECIMAL_4 = 4 DECIMAL_3 = 3 DECIMAL_2 = 2 DECIMAL_1 = 1 DECIMAL_7 = 7 DECIMAL_0 = 0 try: import cvxopt # noqa:F401 has_cvxopt = True except ImportError: has_cvxopt = False class CheckRegressionResults(object): """ res2 contains results from Rmodelwrap or were obtained from a statistical packages such as R, Stata, or SAS and were written to model_results """ decimal_params = DECIMAL_4 def test_params(self): assert_almost_equal( self.res1.params, self.res2.params, self.decimal_params ) decimal_standarderrors = DECIMAL_4 def test_standarderrors(self): assert_allclose( self.res1.bse, self.res2.bse, self.decimal_standarderrors ) decimal_confidenceintervals = DECIMAL_4 def test_confidenceintervals(self): # NOTE: stata rounds residuals (at least) to sig digits so approx_equal conf1 = self.res1.conf_int() conf2 = self.res2.conf_int() for i in range(len(conf1)): assert_allclose( conf1[i][0], conf2[i][0], rtol=10 ** -self.decimal_confidenceintervals, ) assert_allclose( conf1[i][1], conf2[i][1], rtol=10 ** -self.decimal_confidenceintervals, ) decimal_conf_int_subset = DECIMAL_4 def test_conf_int_subset(self): if len(self.res1.params) > 1: with pytest.warns(FutureWarning, match="cols is"): ci1 = self.res1.conf_int(cols=(1, 2)) ci2 = self.res1.conf_int()[1:3] assert_almost_equal(ci1, ci2, self.decimal_conf_int_subset) else: pass decimal_scale = DECIMAL_4 def test_scale(self): assert_almost_equal( self.res1.scale, self.res2.scale, self.decimal_scale ) decimal_rsquared = DECIMAL_4 def test_rsquared(self): assert_almost_equal( self.res1.rsquared, self.res2.rsquared, self.decimal_rsquared ) decimal_rsquared_adj = DECIMAL_4 def test_rsquared_adj(self): assert_almost_equal( self.res1.rsquared_adj, self.res2.rsquared_adj, self.decimal_rsquared_adj, ) def test_degrees(self): assert_equal(self.res1.model.df_model, self.res2.df_model) assert_equal(self.res1.model.df_resid, self.res2.df_resid) decimal_ess = DECIMAL_4 def test_ess(self): # Explained Sum of Squares assert_almost_equal(self.res1.ess, self.res2.ess, self.decimal_ess) decimal_ssr = DECIMAL_4 def test_sumof_squaredresids(self): assert_almost_equal(self.res1.ssr, self.res2.ssr, self.decimal_ssr) decimal_mse_resid = DECIMAL_4 def test_mse_resid(self): # Mean squared error of residuals assert_almost_equal( self.res1.mse_model, self.res2.mse_model, self.decimal_mse_resid ) decimal_mse_model = DECIMAL_4 def test_mse_model(self): assert_almost_equal( self.res1.mse_resid, self.res2.mse_resid, self.decimal_mse_model ) decimal_mse_total = DECIMAL_4 def test_mse_total(self): assert_almost_equal( self.res1.mse_total, self.res2.mse_total, self.decimal_mse_total, err_msg="Test class %s" % self, ) decimal_fvalue = DECIMAL_4 def test_fvalue(self): # did not change this, not sure it should complain -inf not equal -inf # if not (np.isinf(self.res1.fvalue) and np.isinf(self.res2.fvalue)): assert_almost_equal( self.res1.fvalue, self.res2.fvalue, self.decimal_fvalue ) decimal_loglike = DECIMAL_4 def test_loglike(self): assert_almost_equal(self.res1.llf, self.res2.llf, self.decimal_loglike) decimal_aic = DECIMAL_4 def test_aic(self): assert_almost_equal(self.res1.aic, self.res2.aic, self.decimal_aic) # the following just checks the definition aicc1 = self.res1.info_criteria("aicc") k = self.res1.df_model + self.res1.model.k_constant nobs = self.res1.model.nobs aicc2 = self.res1.aic + 2 * (k**2 + k) / (nobs - k - 1) assert_allclose(aicc1, aicc2, rtol=1e-10) hqic1 = self.res1.info_criteria("hqic") hqic2 = (self.res1.aic - 2 * k) + 2 * np.log(np.log(nobs)) * k assert_allclose(hqic1, hqic2, rtol=1e-10) decimal_bic = DECIMAL_4 def test_bic(self): assert_almost_equal(self.res1.bic, self.res2.bic, self.decimal_bic) decimal_pvalues = DECIMAL_4 def test_pvalues(self): assert_almost_equal( self.res1.pvalues, self.res2.pvalues, self.decimal_pvalues ) decimal_wresid = DECIMAL_4 def test_wresid(self): assert_almost_equal( self.res1.wresid, self.res2.wresid, self.decimal_wresid ) decimal_resids = DECIMAL_4 def test_resids(self): assert_almost_equal( self.res1.resid, self.res2.resid, self.decimal_resids ) decimal_norm_resids = DECIMAL_4 def test_norm_resids(self): assert_almost_equal( self.res1.resid_pearson, self.res2.resid_pearson, self.decimal_norm_resids, ) # TODO: test fittedvalues and what else? class TestOLS(CheckRegressionResults): @classmethod def setup_class(cls): from .results.results_regression import Longley data = longley.load() endog = np.asarray(data.endog) exog = np.asarray(data.exog) exog = add_constant(exog, prepend=False) res1 = OLS(endog, exog).fit() res2 = Longley() res2.wresid = res1.wresid # workaround hack cls.res1 = res1 cls.res2 = res2 res_qr = OLS(endog, exog).fit(method="qr") model_qr = OLS(endog, exog) Q, R = np.linalg.qr(exog) model_qr.exog_Q, model_qr.exog_R = Q, R model_qr.normalized_cov_params = np.linalg.inv(np.dot(R.T, R)) model_qr.rank = np.linalg.matrix_rank(R) res_qr2 = model_qr.fit(method="qr") cls.res_qr = res_qr cls.res_qr_manual = res_qr2 def test_eigenvalues(self): eigenval_perc_diff = ( self.res_qr.eigenvals - self.res_qr_manual.eigenvals ) eigenval_perc_diff /= self.res_qr.eigenvals zeros = np.zeros_like(eigenval_perc_diff) assert_almost_equal(eigenval_perc_diff, zeros, DECIMAL_7) # Robust error tests. Compare values computed with SAS def test_HC0_errors(self): # They are split up because the copied results do not have any # DECIMAL_4 places for the last place. assert_almost_equal( self.res1.HC0_se[:-1], self.res2.HC0_se[:-1], DECIMAL_4 ) assert_allclose(np.round(self.res1.HC0_se[-1]), self.res2.HC0_se[-1]) def test_HC1_errors(self): assert_almost_equal( self.res1.HC1_se[:-1], self.res2.HC1_se[:-1], DECIMAL_4 ) # Note: tolerance is tight; rtol=3e-7 fails while 4e-7 passes assert_allclose(self.res1.HC1_se[-1], self.res2.HC1_se[-1], rtol=4e-7) def test_HC2_errors(self): assert_almost_equal( self.res1.HC2_se[:-1], self.res2.HC2_se[:-1], DECIMAL_4 ) # Note: tolerance is tight; rtol=4e-7 fails while 5e-7 passes assert_allclose(self.res1.HC2_se[-1], self.res2.HC2_se[-1], rtol=5e-7) def test_HC3_errors(self): assert_almost_equal( self.res1.HC3_se[:-1], self.res2.HC3_se[:-1], DECIMAL_4 ) # Note: tolerance is tight; rtol=1e-7 fails while 1.5e-7 passes assert_allclose( self.res1.HC3_se[-1], self.res2.HC3_se[-1], rtol=1.5e-7 ) def test_qr_params(self): assert_almost_equal(self.res1.params, self.res_qr.params, 6) def test_qr_normalized_cov_params(self): # todo: need assert_close assert_almost_equal( np.ones_like(self.res1.normalized_cov_params), self.res1.normalized_cov_params / self.res_qr.normalized_cov_params, 5, ) def test_missing(self): data = longley.load() data.exog = add_constant(data.exog, prepend=False) data.endog[[3, 7, 14]] = np.nan mod = OLS(data.endog, data.exog, missing="drop") assert_equal(mod.endog.shape[0], 13) assert_equal(mod.exog.shape[0], 13) def test_rsquared_adj_overfit(self): # Test that if df_resid = 0, rsquared_adj = 0. # This is a regression test for user issue: # https://github.com/statsmodels/statsmodels/issues/868 with warnings.catch_warnings(record=True): x = np.random.randn(5) y = np.random.randn(5, 6) results = OLS(x, y).fit() rsquared_adj = results.rsquared_adj assert_equal(rsquared_adj, np.nan) def test_qr_alternatives(self): assert_allclose( self.res_qr.params, self.res_qr_manual.params, rtol=5e-12 ) def test_norm_resid(self): resid = self.res1.wresid norm_resid = resid / np.sqrt(np.sum(resid ** 2.0) / self.res1.df_resid) model_norm_resid = self.res1.resid_pearson assert_almost_equal(model_norm_resid, norm_resid, DECIMAL_7) def test_summary_slim(self): # check that slim summary is smaller, does not verify content summ = self.res1.summary(slim=True) assert len(summ.tables) == 2 assert len(str(summ)) < 6700 def test_norm_resid_zero_variance(self): with warnings.catch_warnings(record=True): y = self.res1.model.endog res = OLS(y, y).fit() assert_allclose(res.scale, 0, atol=1e-20) assert_allclose(res.wresid, res.resid_pearson, atol=5e-11) class TestRTO(CheckRegressionResults): @classmethod def setup_class(cls): from .results.results_regression import LongleyRTO data = longley.load() endog = np.asarray(data.endog) exog = np.asarray(data.exog) res1 = OLS(endog, exog).fit() res2 = LongleyRTO() res2.wresid = res1.wresid # workaround hack cls.res1 = res1 cls.res2 = res2 res_qr = OLS(endog, exog).fit(method="qr") cls.res_qr = res_qr class TestFtest(object): """ Tests f_test vs. RegressionResults """ @classmethod def setup_class(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) cls.res1 = OLS(data.endog, data.exog).fit() R = np.identity(7)[:-1, :] cls.Ftest = cls.res1.f_test(R) def test_F(self): assert_almost_equal(self.Ftest.fvalue, self.res1.fvalue, DECIMAL_4) def test_p(self): assert_almost_equal(self.Ftest.pvalue, self.res1.f_pvalue, DECIMAL_4) def test_Df_denom(self): assert_equal(self.Ftest.df_denom, self.res1.model.df_resid) def test_Df_num(self): assert_equal(self.Ftest.df_num, 6) class TestFTest2(object): """ A joint test that the coefficient on GNP = the coefficient on UNEMP and that the coefficient on POP = the coefficient on YEAR for the Longley dataset. Ftest1 is from statsmodels. Results are from Rpy using R's car library. """ @classmethod def setup_class(cls): data = longley.load() columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)] data.exog.columns = columns data.exog = add_constant(data.exog, prepend=False) res1 = OLS(data.endog, data.exog).fit() R2 = [[0, 1, -1, 0, 0, 0, 0], [0, 0, 0, 0, 1, -1, 0]] cls.Ftest1 = res1.f_test(R2) hyp = "x2 = x3, x5 = x6" cls.NewFtest1 = res1.f_test(hyp) def test_new_ftest(self): assert_equal(self.NewFtest1.fvalue, self.Ftest1.fvalue) def test_fvalue(self): assert_almost_equal(self.Ftest1.fvalue, 9.7404618732968196, DECIMAL_4) def test_pvalue(self): assert_almost_equal( self.Ftest1.pvalue, 0.0056052885317493459, DECIMAL_4 ) def test_df_denom(self): assert_equal(self.Ftest1.df_denom, 9) def test_df_num(self): assert_equal(self.Ftest1.df_num, 2) class TestFtestQ(object): """ A joint hypothesis test that Rb = q. Coefficient tests are essentially made up. Test values taken from Stata. """ @classmethod def setup_class(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) res1 = OLS(data.endog, data.exog).fit() R = np.array( [ [0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0], ] ) q = np.array([0, 0, 0, 1, 0]) cls.Ftest1 = res1.f_test((R, q)) def test_fvalue(self): assert_almost_equal(self.Ftest1.fvalue, 70.115557, 5) def test_pvalue(self): assert_almost_equal(self.Ftest1.pvalue, 6.229e-07, 10) def test_df_denom(self): assert_equal(self.Ftest1.df_denom, 9) def test_df_num(self): assert_equal(self.Ftest1.df_num, 5) class TestTtest(object): """ Test individual t-tests. Ie., are the coefficients significantly different than zero. """ @classmethod def setup_class(cls): data = longley.load() columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)] data.exog.columns = columns data.exog = add_constant(data.exog, prepend=False) cls.res1 = OLS(data.endog, data.exog).fit() R = np.identity(7) cls.Ttest = cls.res1.t_test(R) hyp = "x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0" cls.NewTTest = cls.res1.t_test(hyp) def test_new_tvalue(self): assert_equal(self.NewTTest.tvalue, self.Ttest.tvalue) def test_tvalue(self): assert_almost_equal(self.Ttest.tvalue, self.res1.tvalues, DECIMAL_4) def test_sd(self): assert_almost_equal(self.Ttest.sd, self.res1.bse, DECIMAL_4) def test_pvalue(self): assert_almost_equal( self.Ttest.pvalue, student_t.sf(np.abs(self.res1.tvalues), self.res1.model.df_resid) * 2, DECIMAL_4, ) def test_df_denom(self): assert_equal(self.Ttest.df_denom, self.res1.model.df_resid) def test_effect(self): assert_almost_equal(self.Ttest.effect, self.res1.params) class TestTtest2(object): """ Tests the hypothesis that the coefficients on POP and YEAR are equal. Results from RPy using 'car' package. """ @classmethod def setup_class(cls): R = np.zeros(7) R[4:6] = [1, -1] data = longley.load() data.exog = add_constant(data.exog, prepend=False) res1 = OLS(data.endog, data.exog).fit() cls.Ttest1 = res1.t_test(R) def test_tvalue(self): assert_almost_equal(self.Ttest1.tvalue, -4.0167754636397284, DECIMAL_4) def test_sd(self): assert_almost_equal(self.Ttest1.sd, 455.39079425195314, DECIMAL_4) def test_pvalue(self): assert_almost_equal( self.Ttest1.pvalue, 2 * 0.0015163772380932246, DECIMAL_4 ) def test_df_denom(self): assert_equal(self.Ttest1.df_denom, 9) def test_effect(self): assert_almost_equal(self.Ttest1.effect, -1829.2025687186533, DECIMAL_4) class TestGLS(object): """ These test results were obtained by replication with R. """ @classmethod def setup_class(cls): from .results.results_regression import LongleyGls data = longley.load() exog = add_constant( np.column_stack((data.exog.iloc[:, 1], data.exog.iloc[:, 4])), prepend=False, ) tmp_results = OLS(data.endog, exog).fit() rho = np.corrcoef(tmp_results.resid[1:], tmp_results.resid[:-1])[0][ 1 ] # by assumption order = toeplitz(np.arange(16)) sigma = rho ** order GLS_results = GLS(data.endog, exog, sigma=sigma).fit() cls.res1 = GLS_results cls.res2 = LongleyGls() # attach for test_missing cls.sigma = sigma cls.exog = exog cls.endog = data.endog def test_aic(self): # Note: tolerance is tight; rtol=3e-3 fails while 4e-3 passes assert_allclose(self.res1.aic + 2, self.res2.aic, rtol=4e-3) def test_bic(self): # Note: tolerance is tight; rtol=1e-2 fails while 1.5e-2 passes assert_allclose(self.res1.bic, self.res2.bic, rtol=1.5e-2) def test_loglike(self): assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_0) def test_params(self): assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_1) def test_resid(self): assert_almost_equal(self.res1.resid, self.res2.resid, DECIMAL_4) def test_scale(self): assert_almost_equal(self.res1.scale, self.res2.scale, DECIMAL_4) def test_tvalues(self): assert_almost_equal(self.res1.tvalues, self.res2.tvalues, DECIMAL_4) def test_standarderrors(self): assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4) def test_fittedvalues(self): assert_almost_equal( self.res1.fittedvalues, self.res2.fittedvalues, DECIMAL_4 ) def test_pvalues(self): assert_almost_equal(self.res1.pvalues, self.res2.pvalues, DECIMAL_4) def test_missing(self): endog = self.endog.copy() # copy or changes endog for other methods endog[[4, 7, 14]] = np.nan mod = GLS(endog, self.exog, sigma=self.sigma, missing="drop") assert_equal(mod.endog.shape[0], 13) assert_equal(mod.exog.shape[0], 13) assert_equal(mod.sigma.shape, (13, 13)) class TestGLS_alt_sigma(CheckRegressionResults): """ Test that GLS with no argument is equivalent to OLS. """ @classmethod def setup_class(cls): data = longley.load() endog = np.asarray(data.endog) exog = np.asarray(data.exog) exog = add_constant(exog, prepend=False) ols_res = OLS(endog, exog).fit() gls_res = GLS(endog, exog).fit() gls_res_scalar = GLS(endog, exog, sigma=1) cls.endog = endog cls.exog = exog cls.res1 = gls_res cls.res2 = ols_res cls.res3 = gls_res_scalar # self.res2.conf_int = self.res2.conf_int() def test_wrong_size_sigma_1d(self): n = len(self.endog) assert_raises( ValueError, GLS, self.endog, self.exog, sigma=np.ones(n - 1) ) def test_wrong_size_sigma_2d(self): n = len(self.endog) assert_raises( ValueError, GLS, self.endog, self.exog, sigma=np.ones((n - 1, n - 1)), ) # FIXME: do not leave commented-out, use or move/remove # def check_confidenceintervals(self, conf1, conf2): # assert_almost_equal(conf1, conf2, DECIMAL_4) class TestLM(object): @classmethod def setup_class(cls): # TODO: Test HAC method rs = np.random.RandomState(1234) x = rs.randn(100, 3) b = np.ones((3, 1)) e = rs.randn(100, 1) y = np.dot(x, b) + e # Cases? # Homoskedastic # HC0 cls.res1_full = OLS(y, x).fit() cls.res1_restricted = OLS(y, x[:, 0]).fit() cls.res2_full = cls.res1_full.get_robustcov_results("HC0") cls.res2_restricted = cls.res1_restricted.get_robustcov_results("HC0") cls.x = x cls.Y = y def test_LM_homoskedastic(self): resid = self.res1_restricted.wresid n = resid.shape[0] x = self.x S = np.dot(resid, resid) / n * np.dot(x.T, x) / n Sinv = np.linalg.inv(S) s = np.mean(x * resid[:, None], 0) LMstat = n * np.dot(np.dot(s, Sinv), s.T) LMstat_OLS = self.res1_full.compare_lm_test(self.res1_restricted) LMstat2 = LMstat_OLS[0] assert_almost_equal(LMstat, LMstat2, DECIMAL_7) def test_LM_heteroskedastic_nodemean(self): resid = self.res1_restricted.wresid n = resid.shape[0] x = self.x scores = x * resid[:, None] S = np.dot(scores.T, scores) / n Sinv = np.linalg.inv(S) s = np.mean(scores, 0) LMstat = n * np.dot(np.dot(s, Sinv), s.T) LMstat_OLS = self.res2_full.compare_lm_test( self.res2_restricted, demean=False ) LMstat2 = LMstat_OLS[0] assert_almost_equal(LMstat, LMstat2, DECIMAL_7) def test_LM_heteroskedastic_demean(self): resid = self.res1_restricted.wresid n = resid.shape[0] x = self.x scores = x * resid[:, None] scores_demean = scores - scores.mean(0) S = np.dot(scores_demean.T, scores_demean) / n Sinv = np.linalg.inv(S) s = np.mean(scores, 0) LMstat = n * np.dot(np.dot(s, Sinv), s.T) LMstat_OLS = self.res2_full.compare_lm_test(self.res2_restricted) LMstat2 = LMstat_OLS[0] assert_almost_equal(LMstat, LMstat2, DECIMAL_7) def test_LM_heteroskedastic_LRversion(self): resid = self.res1_restricted.wresid resid_full = self.res1_full.wresid n = resid.shape[0] x = self.x scores = x * resid[:, None] s = np.mean(scores, 0) scores = x * resid_full[:, None] S = np.dot(scores.T, scores) / n Sinv = np.linalg.inv(S) LMstat = n * np.dot(np.dot(s, Sinv), s.T) LMstat_OLS = self.res2_full.compare_lm_test( self.res2_restricted, use_lr=True ) LMstat2 = LMstat_OLS[0] assert_almost_equal(LMstat, LMstat2, DECIMAL_7) def test_LM_nonnested(self): assert_raises( ValueError, self.res2_restricted.compare_lm_test, self.res2_full ) class TestOLS_GLS_WLS_equivalence(object): @classmethod def setup_class(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) y = data.endog x = data.exog n = y.shape[0] w = np.ones(n) cls.results = [] cls.results.append(OLS(y, x).fit()) cls.results.append(WLS(y, x, w).fit()) # scaling weights does not change main results (except scale) cls.results.append(GLS(y, x, 100 * w).fit()) cls.results.append(GLS(y, x, np.diag(0.1 * w)).fit()) def test_ll(self): llf = np.array([r.llf for r in self.results]) llf_1 = np.ones_like(llf) * self.results[0].llf assert_almost_equal(llf, llf_1, DECIMAL_7) ic = np.array([r.aic for r in self.results]) ic_1 = np.ones_like(ic) * self.results[0].aic assert_almost_equal(ic, ic_1, DECIMAL_7) ic = np.array([r.bic for r in self.results]) ic_1 = np.ones_like(ic) * self.results[0].bic assert_almost_equal(ic, ic_1, DECIMAL_7) def test_params(self): params = np.array([r.params for r in self.results]) params_1 = np.array([self.results[0].params] * len(self.results)) assert_allclose(params, params_1) def test_ss(self): bse = np.array([r.bse for r in self.results]) bse_1 = np.array([self.results[0].bse] * len(self.results)) assert_allclose(bse, bse_1) def test_rsquared(self): rsquared = np.array([r.rsquared for r in self.results]) rsquared_1 = np.array([self.results[0].rsquared] * len(self.results)) assert_almost_equal(rsquared, rsquared_1, DECIMAL_7) class TestGLS_WLS_equivalence(TestOLS_GLS_WLS_equivalence): # reuse test methods @classmethod def setup_class(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) y = data.endog x = data.exog n = y.shape[0] np.random.seed(5) w = np.random.uniform(0.5, 1, n) w_inv = 1.0 / w cls.results = [] cls.results.append(WLS(y, x, w).fit()) # scaling weights does not change main results (except scale) cls.results.append(WLS(y, x, 0.01 * w).fit()) cls.results.append(GLS(y, x, 100 * w_inv).fit()) cls.results.append(GLS(y, x, np.diag(0.1 * w_inv)).fit()) class TestNonFit(object): @classmethod def setup_class(cls): data = longley.load() data.exog = add_constant(data.exog, prepend=False) cls.endog = data.endog cls.exog = data.exog cls.ols_model = OLS(data.endog, data.exog) def test_df_resid(self): df_resid = self.endog.shape[0] - self.exog.shape[1] assert_equal(self.ols_model.df_resid, 9) class TestWLS_CornerCases(object): @classmethod def setup_class(cls): cls.exog = np.ones((1,)) cls.endog = np.ones((1,)) weights = 1 cls.wls_res = WLS(cls.endog, cls.exog, weights=weights).fit() def test_wrong_size_weights(self): weights = np.ones((10, 10)) assert_raises(ValueError, WLS, self.endog, self.exog, weights=weights) class TestWLSExogWeights(CheckRegressionResults): # Test WLS with Greene's credit card data # reg avgexp age income incomesq ownrent [aw=1/incomesq] @classmethod def setup_class(cls): from statsmodels.datasets.ccard import load from .results.results_regression import CCardWLS dta = load() endog = np.asarray(dta.endog) exog = np.asarray(dta.exog) exog = add_constant(exog, prepend=False) nobs = 72.0 weights = 1 / exog[:, 2] # for comparison with stata analytic weights scaled_weights = (weights * nobs) / weights.sum() cls.res1 = WLS(endog, exog, weights=scaled_weights).fit() cls.res2 = CCardWLS() cls.res2.wresid = scaled_weights ** 0.5 * cls.res2.resid # correction because we use different definition for loglike/llf corr_ic = 2 * (cls.res1.llf - cls.res2.llf) cls.res2.aic -= corr_ic cls.res2.bic -= corr_ic cls.res2.llf += 0.5 * np.sum(np.log(cls.res1.model.weights)) def test_wls_example(): # example from the docstring, there was a note about a bug, should # be fixed now Y = [1, 3, 4, 5, 2, 3, 4] x = lrange(1, 8) x = add_constant(x, prepend=False) wls_model = WLS(Y, x, weights=lrange(1, 8)).fit() # taken from R lm.summary assert_almost_equal(wls_model.fvalue, 0.127337843215, 6) assert_almost_equal(wls_model.scale, 2.44608530786 ** 2, 6) def test_wls_tss(): y = np.array([22, 22, 22, 23, 23, 23]) x = [[1, 0], [1, 0], [1, 1], [0, 1], [0, 1], [0, 1]] ols_mod = OLS(y, add_constant(x, prepend=False)).fit() yw = np.array([22, 22, 23.0]) Xw = [[1, 0], [1, 1], [0, 1]] w = np.array([2, 1, 3.0]) wls_mod = WLS(yw, add_constant(Xw, prepend=False), weights=w).fit() assert_equal(ols_mod.centered_tss, wls_mod.centered_tss) class TestWLSScalarVsArray(CheckRegressionResults): @classmethod def setup_class(cls): from statsmodels.datasets.longley import load dta = load() endog = np.asarray(dta.endog) exog = np.asarray(dta.exog) exog = add_constant(exog, prepend=True) wls_scalar = WLS(endog, exog, weights=1.0 / 3).fit() weights = [1 / 3.0] * len(endog) wls_array = WLS(endog, exog, weights=weights).fit() cls.res1 = wls_scalar cls.res2 = wls_array class TestWLS_GLS(CheckRegressionResults): @classmethod def setup_class(cls): from statsmodels.datasets.ccard import load data = load() endog = np.asarray(data.endog) exog = np.asarray(data.exog) sigma = exog[:, 2] cls.res1 = WLS(endog, exog, weights=1 / sigma).fit() cls.res2 = GLS(endog, exog, sigma=sigma).fit() def check_confidenceintervals(self, conf1, conf2): # FIXME: never called assert_almost_equal(conf1, conf2(), DECIMAL_4) def test_wls_missing(): from statsmodels.datasets.ccard import load data = load() endog = data.endog endog[[10, 25]] = np.nan mod = WLS( data.endog, data.exog, weights=1 / data.exog.iloc[:, 2], missing="drop" ) assert_equal(mod.endog.shape[0], 70) assert_equal(mod.exog.shape[0], 70) assert_equal(mod.weights.shape[0], 70) class TestWLS_OLS(CheckRegressionResults): @classmethod def setup_class(cls): data = longley.load() endog = np.asarray(data.endog) exog = np.asarray(data.exog) exog = add_constant(exog, prepend=False) cls.res1 = OLS(endog, exog).fit() cls.res2 = WLS(endog, exog).fit() def check_confidenceintervals(self, conf1, conf2): # FIXME: never called assert_almost_equal(conf1, conf2(), DECIMAL_4) class TestGLS_OLS(CheckRegressionResults): @classmethod def setup_class(cls): data = longley.load() endog = np.asarray(data.endog) exog = np.asarray(data.exog) exog = add_constant(exog, prepend=False) cls.res1 = GLS(endog, exog).fit() cls.res2 = OLS(endog, exog).fit() def check_confidenceintervals(self, conf1, conf2): # FIXME: never called assert_almost_equal(conf1, conf2(), DECIMAL_4) # FIXME: do not leave this commented-out sitting here # TODO: test AR # why the two-stage in AR? # class TestAR(object): # from statsmodels.datasets.sunspots import load # data = load() # model = AR(data.endog, rho=4).fit() # R_res = RModel(data.endog, aic="FALSE", order_max=4)# # def test_params(self): # assert_almost_equal(self.model.rho, # pass # def test_order(self): # In R this can be defined or chosen by minimizing the AIC if aic=True # pass class TestYuleWalker(object): @classmethod def setup_class(cls): from statsmodels.datasets.sunspots import load data = load() cls.rho, cls.sigma = yule_walker(data.endog, order=4, method="mle") cls.R_params = [ 1.2831003105694765, -0.45240924374091945, -0.20770298557575195, 0.047943648089542337, ] def test_params(self): assert_almost_equal(self.rho, self.R_params, DECIMAL_4) class TestDataDimensions(CheckRegressionResults): @classmethod def setup_class(cls): np.random.seed(54321) cls.endog_n_ = np.random.uniform(0, 20, size=30) cls.endog_n_one = cls.endog_n_[:, None] cls.exog_n_ = np.random.uniform(0, 20, size=30) cls.exog_n_one = cls.exog_n_[:, None] cls.degen_exog = cls.exog_n_one[:-1] cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one) cls.mod1.df_model += 1 cls.res1 = cls.mod1.fit() # Note that these are created for every subclass.. # A little extra overhead probably cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one) cls.mod2.df_model += 1 cls.res2 = cls.mod2.fit() def check_confidenceintervals(self, conf1, conf2): # FIXME: never called assert_almost_equal(conf1, conf2(), DECIMAL_4) class TestGLS_large_data(TestDataDimensions): @classmethod def setup_class(cls): super(TestGLS_large_data, cls).setup_class() nobs = 1000 y = np.random.randn(nobs, 1) x = np.random.randn(nobs, 20) sigma = np.ones_like(y) cls.gls_res = GLS(y, x, sigma=sigma).fit() cls.gls_res_scalar = GLS(y, x, sigma=1).fit() cls.gls_res_none = GLS(y, x).fit() cls.ols_res = OLS(y, x).fit() def test_large_equal_params(self): assert_almost_equal( self.ols_res.params, self.gls_res.params, DECIMAL_7 ) def test_large_equal_loglike(self): assert_almost_equal(self.ols_res.llf, self.gls_res.llf, DECIMAL_7) def test_large_equal_params_none(self): assert_almost_equal( self.gls_res.params, self.gls_res_none.params, DECIMAL_7 ) class TestNxNx(TestDataDimensions): @classmethod def setup_class(cls): super(TestNxNx, cls).setup_class() cls.mod2 = OLS(cls.endog_n_, cls.exog_n_) cls.mod2.df_model += 1 cls.res2 = cls.mod2.fit() class TestNxOneNx(TestDataDimensions): @classmethod def setup_class(cls): super(TestNxOneNx, cls).setup_class() cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_) cls.mod2.df_model += 1 cls.res2 = cls.mod2.fit() class TestNxNxOne(TestDataDimensions): @classmethod def setup_class(cls): super(TestNxNxOne, cls).setup_class() cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one) cls.mod2.df_model += 1 cls.res2 = cls.mod2.fit() def test_bad_size(): np.random.seed(54321) data = np.random.uniform(0, 20, 31) assert_raises(ValueError, OLS, data, data[1:]) def test_const_indicator(): rs = np.random.RandomState(12345) x = rs.randint(0, 3, size=30) x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False) y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30) resc = OLS(y, add_constant(x.iloc[:, 1:], prepend=True)).fit() res = OLS(y, x, hasconst=True).fit() assert_almost_equal(resc.rsquared, res.rsquared, 12) assert res.model.data.k_constant == 1 assert resc.model.data.k_constant == 1 def test_fvalue_const_only(): rs = np.random.RandomState(12345) x = rs.randint(0, 3, size=30) x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False) x.iloc[:, 0] = 1 y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30) res = OLS(y, x, hasconst=True).fit(cov_type="HC1") assert not np.isnan(res.fvalue) assert isinstance(res.fvalue, float) assert isinstance(res.f_pvalue, float) def test_conf_int_single_regressor(): # GH#706 single-regressor model (i.e. no intercept) with 1D exog # should get passed to DataFrame for conf_int y = pd.Series(np.random.randn(10)) x = pd.Series(np.ones(10)) res = OLS(y, x).fit() conf_int = res.conf_int() np.testing.assert_equal(conf_int.shape, (1, 2)) np.testing.assert_(isinstance(conf_int, pd.DataFrame)) def test_summary_as_latex(): # GH#734 import re dta = longley.load_pandas() x = dta.exog x["constant"] = 1 y = dta.endog res = OLS(y, x).fit() with pytest.warns(UserWarning): table = res.summary().as_latex() # replace the date and time table = re.sub( "(?<=\n\\\\textbf\\{Date:\\} &).+?&", " Sun, 07 Apr 2013 &", table, ) table = re.sub( "(?<=\n\\\\textbf\\{Time:\\} &).+?&", " 13:46:07 &", table, ) expected = """\\begin{center} \\begin{tabular}{lclc} \\toprule \\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\ \\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\ \\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\ \\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\ \\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\ \\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\ \\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\ \\textbf{Df Model:} & 6 & \\textbf{ } & \\\\ \\textbf{Covariance Type:} & nonrobust & \\textbf{ } & \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lcccccc} & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\ \\midrule \\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\ \\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\ \\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\ \\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\ \\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\ \\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\ \\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\ \\bottomrule \\end{tabular} \\begin{tabular}{lclc} \\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\ \\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\ \\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\ \\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\ \\bottomrule \\end{tabular} %\\caption{OLS Regression Results} \\end{center} Notes: \\newline [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline strong multicollinearity or other numerical problems.""" assert_equal(table, expected) class TestRegularizedFit(object): # Make sure there are no problems when no variables are selected. def test_empty_model(self): np.random.seed(742) n = 100 endog = np.random.normal(size=n) exog = np.random.normal(size=(n, 3)) for cls in OLS, WLS, GLS: model = cls(endog, exog) result = model.fit_regularized(alpha=1000) assert_equal(result.params, 0.0) def test_regularized(self): import os from .results import glmnet_r_results cur_dir = os.path.dirname(os.path.abspath(__file__)) data = np.loadtxt( os.path.join(cur_dir, "results", "lasso_data.csv"), delimiter="," ) tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")] for test in tests: vec = getattr(glmnet_r_results, test) n = vec[0] p = vec[1] L1_wt = float(vec[2]) lam = float(vec[3]) params = vec[4:].astype(np.float64) endog = data[0 : int(n), 0] exog = data[0 : int(n), 1 : (int(p) + 1)] endog = endog - endog.mean() endog /= endog.std(ddof=1) exog = exog - exog.mean(0) exog /= exog.std(0, ddof=1) for cls in OLS, WLS, GLS: mod = cls(endog, exog) rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam) assert_almost_equal(rslt.params, params, decimal=3) # Smoke test for profile likelihood mod.fit_regularized(L1_wt=L1_wt, alpha=lam, profile_scale=True) def test_regularized_weights(self): np.random.seed(1432) exog1 = np.random.normal(size=(100, 3)) endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100) exog2 = np.random.normal(size=(100, 3)) endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100) exog_a = np.vstack((exog1, exog1, exog2)) endog_a = np.concatenate((endog1, endog1, endog2)) # Should be equivalent to exog_a, endog_a. exog_b = np.vstack((exog1, exog2)) endog_b = np.concatenate((endog1, endog2)) wgts = np.ones(200) wgts[0:100] = 2 sigma = np.diag(1 / wgts) for L1_wt in 0, 0.5, 1: for alpha in 0, 1: mod1 = OLS(endog_a, exog_a) rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha) mod2 = WLS(endog_b, exog_b, weights=wgts) rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha) mod3 = GLS(endog_b, exog_b, sigma=sigma) rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha) assert_almost_equal(rslt1.params, rslt2.params, decimal=3) assert_almost_equal(rslt1.params, rslt3.params, decimal=3) def test_regularized_weights_list(self): np.random.seed(132) exog1 = np.random.normal(size=(100, 3)) endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100) exog2 = np.random.normal(size=(100, 3)) endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100) exog_a = np.vstack((exog1, exog1, exog2)) endog_a = np.concatenate((endog1, endog1, endog2)) # Should be equivalent to exog_a, endog_a. exog_b = np.vstack((exog1, exog2)) endog_b = np.concatenate((endog1, endog2)) wgts = np.ones(200) wgts[0:100] = 2 sigma = np.diag(1 / wgts) for L1_wt in 0, 0.5, 1: for alpha_element in 0, 1: alpha = [ alpha_element, ] * 3 mod1 = OLS(endog_a, exog_a) rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha) mod2 = WLS(endog_b, exog_b, weights=wgts) rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha) mod3 = GLS(endog_b, exog_b, sigma=sigma) rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha) assert_almost_equal(rslt1.params, rslt2.params, decimal=3) assert_almost_equal(rslt1.params, rslt3.params, decimal=3) def test_formula_missing_cat(): # gh-805 from patsy import PatsyError import statsmodels.api as sm from statsmodels.formula.api import ols dta = sm.datasets.grunfeld.load_pandas().data dta.loc[dta.index[0], "firm"] = np.nan mod = ols( formula="value ~ invest + capital + firm + year", data=dta.dropna() ) res = mod.fit() mod2 = ols(formula="value ~ invest + capital + firm + year", data=dta) res2 = mod2.fit() assert_almost_equal(res.params.values, res2.params.values) assert_raises( PatsyError, ols, "value ~ invest + capital + firm + year", data=dta, missing="raise", ) def test_missing_formula_predict(): # see 2171 nsample = 30 data = np.linspace(0, 10, nsample) null = np.array([np.nan]) data = pd.DataFrame({"x": np.concatenate((data, null))}) beta = np.array([1, 0.1]) e = np.random.normal(size=nsample + 1) data["y"] = beta[0] + beta[1] * data["x"] + e model = OLS.from_formula("y ~ x", data=data) fit = model.fit() fit.predict(exog=data[:-1]) def test_fvalue_implicit_constant(): # if constant is implicit, return nan see #2444 nobs = 100 np.random.seed(2) x = np.random.randn(nobs, 1) x = ((x > 0) == [True, False]).astype(int) y = x.sum(1) + np.random.randn(nobs) from statsmodels.regression.linear_model import OLS, WLS res = OLS(y, x).fit(cov_type="HC1") assert_(np.isnan(res.fvalue)) assert_(np.isnan(res.f_pvalue)) res.summary() res = WLS(y, x).fit(cov_type="HC1") assert_(np.isnan(res.fvalue)) assert_(np.isnan(res.f_pvalue)) res.summary() def test_fvalue_only_constant(): # if only constant in model, return nan see #3642 nobs = 20 np.random.seed(2) x = np.ones(nobs) y = np.random.randn(nobs) from statsmodels.regression.linear_model import OLS, WLS res = OLS(y, x).fit(cov_type="hac", cov_kwds={"maxlags": 3}) assert_(np.isnan(res.fvalue)) assert_(np.isnan(res.f_pvalue)) res.summary() res = WLS(y, x).fit(cov_type="HC1") assert_(np.isnan(res.fvalue)) assert_(np.isnan(res.f_pvalue)) res.summary() def test_ridge(): n = 100 p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) yvec = xmat.sum(1) + np.random.normal(size=n) v = np.ones(p) v[0] = 0 for a in (0, 1, 10): for alpha in (a, a * np.ones(p), a * v): model1 = OLS(yvec, xmat) result1 = model1._fit_ridge(alpha=alpha) model2 = OLS(yvec, xmat) result2 = model2.fit_regularized(alpha=alpha, L1_wt=0) assert_allclose(result1.params, result2.params) model3 = OLS(yvec, xmat) result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10) assert_allclose(result1.params, result3.params) fv1 = result1.fittedvalues fv2 = np.dot(xmat, result1.params) assert_allclose(fv1, fv2) def test_regularized_refit(): n = 100 p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) # covariates 0 and 2 matter yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n) model1 = OLS(yvec, xmat) result1 = model1.fit_regularized(alpha=2.0, L1_wt=0.5, refit=True) model2 = OLS(yvec, xmat[:, [0, 2]]) result2 = model2.fit() ii = [0, 2] assert_allclose(result1.params[ii], result2.params) assert_allclose(result1.bse[ii], result2.bse) def test_regularized_predict(): # this also compares WLS with GLS n = 100 p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) yvec = xmat.sum(1) + np.random.normal(size=n) wgt = np.random.uniform(1, 2, n) model_wls = WLS(yvec, xmat, weights=wgt) # TODO: params is not the same in GLS if sigma=1 / wgt, i.e 1-dim, #7755 model_gls1 = GLS(yvec, xmat, sigma=np.diag(1 / wgt)) model_gls2 = GLS(yvec, xmat, sigma=1 / wgt) res = [] for model1 in [model_wls, model_gls1, model_gls2]: result1 = model1.fit_regularized(alpha=20.0, L1_wt=0.5, refit=True) res.append(result1) params = result1.params fittedvalues = np.dot(xmat, params) pr = model1.predict(result1.params) assert_allclose(fittedvalues, pr) assert_allclose(result1.fittedvalues, pr) pr = result1.predict() assert_allclose(fittedvalues, pr) assert_allclose(res[0].model.wendog, res[1].model.wendog, rtol=1e-10) assert_allclose(res[0].model.wexog, res[1].model.wexog, rtol=1e-10) assert_allclose(res[0].fittedvalues, res[1].fittedvalues, rtol=1e-10) assert_allclose(res[0].params, res[1].params, rtol=1e-10) assert_allclose(res[0].model.wendog, res[2].model.wendog, rtol=1e-10) assert_allclose(res[0].model.wexog, res[2].model.wexog, rtol=1e-10) assert_allclose(res[0].fittedvalues, res[2].fittedvalues, rtol=1e-10) assert_allclose(res[0].params, res[2].params, rtol=1e-10) def test_regularized_options(): n = 100 p = 5 np.random.seed(3132) xmat = np.random.normal(size=(n, p)) yvec = xmat.sum(1) + np.random.normal(size=n) model1 = OLS(yvec - 1, xmat) result1 = model1.fit_regularized(alpha=1.0, L1_wt=0.5) model2 = OLS(yvec, xmat, offset=1) result2 = model2.fit_regularized( alpha=1.0, L1_wt=0.5, start_params=np.zeros(5) ) assert_allclose(result1.params, result2.params) def test_burg(): rnd = np.random.RandomState(12345) e = rnd.randn(10001) y = e[1:] + 0.5 * e[:-1] # R, ar.burg expected = [ [0.3909931], [0.4602607, -0.1771582], [0.47473245, -0.21475602, 0.08168813], [0.4787017, -0.2251910, 0.1047554, -0.0485900], [0.47975462, -0.22746106, 0.10963527, -0.05896347, 0.02167001], ] for i in range(1, 6): ar, _ = burg(y, i) assert_allclose(ar, expected[i - 1], atol=1e-6) as_nodemean, _ = burg(1 + y, i, False) assert np.all(ar != as_nodemean) def test_burg_errors(): with pytest.raises(ValueError): burg(np.ones((100, 2))) with pytest.raises(ValueError): burg(np.random.randn(100), 0) with pytest.raises(ValueError): burg(np.random.randn(100), "apple") @pytest.mark.skipif(not has_cvxopt, reason="sqrt_lasso requires cvxopt") def test_sqrt_lasso(): np.random.seed(234923) # Based on the example in the Belloni paper n = 100 p = 500 ii = np.arange(p) cx = 0.5 ** np.abs(np.subtract.outer(ii, ii)) cxr = np.linalg.cholesky(cx) x = np.dot(np.random.normal(size=(n, p)), cxr.T) b = np.zeros(p) b[0:5] = [1, 1, 1, 1, 1] from scipy.stats.distributions import norm alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p)) # Use very low noise level for a unit test y = np.dot(x, b) + 0.25 * np.random.normal(size=n) # At low noise levels, the sqrt lasso should be around a # factor of 3 from the oracle without refit, and should # almost equal the oracle with refit. expected_oracle = {False: 3, True: 1} # Used for regression testing expected_params = { False: np.r_[ 0.87397122, 0.96051874, 0.9905915, 0.93868953, 0.90771773 ], True: np.r_[0.95114241, 1.0302987, 1.01723074, 0.97587343, 0.99846403], } for refit in False, True: rslt = OLS(y, x).fit_regularized( method="sqrt_lasso", alpha=alpha, refit=refit ) err = rslt.params - b numer = np.sqrt(np.dot(err, np.dot(cx, err))) oracle = OLS(y, x[:, 0:5]).fit() oracle_err = np.zeros(p) oracle_err[0:5] = oracle.params - b[0:5] denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err))) # Check performance relative to oracle, should be around assert_allclose( numer / denom, expected_oracle[refit], rtol=0.5, atol=0.1 ) # Regression test the parameters assert_allclose( rslt.params[0:5], expected_params[refit], rtol=1e-5, atol=1e-5 ) def test_bool_regressor(reset_randomstate): exog = np.random.randint(0, 2, size=(100, 2)).astype(bool) endog = np.random.standard_normal(100) bool_res = OLS(endog, exog).fit() res = OLS(endog, exog.astype(np.double)).fit() assert_allclose(bool_res.params, res.params) def test_ols_constant(reset_randomstate): y = np.random.standard_normal((200)) x = np.ones((200, 1)) res = OLS(y, x).fit() with warnings.catch_warnings(record=True) as recording: assert np.isnan(res.fvalue) assert np.isnan(res.f_pvalue) assert len(recording) == 0 def test_summary_no_constant(): rs = np.random.RandomState(0) x = rs.standard_normal((100, 2)) y = rs.standard_normal(100) summary = OLS(y, x).fit().summary() assert "R² is computed " in summary.as_text() def test_condition_number(reset_randomstate): y = np.random.standard_normal(100) x = np.random.standard_normal((100, 1)) x = x + np.random.standard_normal((100, 5)) res = OLS(y, x).fit() assert_allclose(res.condition_number, np.sqrt(np.linalg.cond(x.T @ x))) assert_allclose(res.condition_number, np.linalg.cond(x)) def test_slim_summary(reset_randomstate): y = np.random.standard_normal(100) x = np.random.standard_normal((100, 1)) x = x + np.random.standard_normal((100, 5)) res = OLS(y, x).fit() summ = res.summary() summ2 = res.summary() slim_summ = res.summary(slim=True) assert str(summ) == str(summ2) assert str(slim_summ) != str(summ)