# -*- coding: utf-8 -*-
"""
unit test for GAM

Author: Josef Perktold

"""

import os

import numpy as np
from numpy.testing import assert_allclose, assert_equal, assert_
import pandas as pd

import pytest

import patsy

from statsmodels.discrete.discrete_model import Poisson, Logit, Probit
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import family
from statsmodels.sandbox.regression.penalized import TheilGLS
from statsmodels.base._penalized import PenalizedMixin
import statsmodels.base._penalties as smpen

from statsmodels.gam.smooth_basis import (BSplines, CyclicCubicSplines)
from statsmodels.gam.generalized_additive_model import (
    GLMGam, GLMGamResults, GLMGamResultsWrapper)

from statsmodels.tools.linalg import matrix_sqrt, transf_constraints

from .results import results_pls, results_mpg_bs, results_mpg_bs_poisson


# Penalized variants of the standard models, built by combining
# PenalizedMixin with the respective model class.
class PoissonPenalized(PenalizedMixin, Poisson):
    pass


class LogitPenalized(PenalizedMixin, Logit):
    pass


class ProbitPenalized(PenalizedMixin, Probit):
    pass


class GLMPenalized(PenalizedMixin, GLM):
    pass


# Data sets are read once at import time from the "results" directory that
# sits next to this module.
cur_dir = os.path.dirname(os.path.abspath(__file__))

file_path = os.path.join(cur_dir, "results", "motorcycle.csv")
data_mcycle = pd.read_csv(file_path)

file_path = os.path.join(cur_dir, "results", "autos.csv")
df_autos_ = pd.read_csv(file_path)
df_autos = df_autos_[['city_mpg', 'fuel', 'drive', 'weight', 'hp']].dropna()


# Expected values for the first 15 fitted values of the motorcycle models
# that are fit with alpha = 1 / s_scale / 2 (shared by the TestGAM6* classes).
pls6_fittedvalues = np.array([
    2.45008146537851, 3.14145063965465, 5.24130119353225,
    6.63476330674223, 7.99704341866374, 13.9351103077006,
    14.5508371638833, 14.785647621276, 15.1176070735895,
    14.8053514054347, 13.790412967255, 13.790412967255,
    11.2997845518655, 9.51681958051473, 8.4811626302547])

# Expected values for the first 10 rows of the model exog (6 columns, values
# listed column by column) of the TestGAM6Exog* models.
pls6_exog = np.array([
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -0.334312615555276, -0.302733562622373,
    -0.200049479196403, -0.12607681525989, -0.0487229716135211,
    0.397628373646056, 0.475396222437879, 0.51311526571058,
    0.685638355361239, 0.745083051531164, -0.633518318499726,
    -0.634362488928233, -0.635472088268483, -0.634802453890957,
    -0.632796625534419, -0.589886140629009, -0.574834708734556,
    -0.566315983948608, -0.51289784236512, -0.486061743835595,
    -0.353449234316442, -0.348107090921062, -0.328814083307981,
    -0.313617048982477, -0.296913301955505, -0.191949693921079,
    -0.173001127145111, -0.163813487426548, -0.12229019995063,
    -0.108463798212062, -0.33613551740577, -0.327911471033406,
    -0.303620832999443, -0.287786799373968, -0.272279566127816,
    -0.194325957984873, -0.18175817334823, -0.175688807660186,
    -0.147654475500976, -0.137597948224942, -0.406564043706154,
    -0.409594429953082, -0.412391645561287, -0.409453786864986,
    -0.403086590828732, -0.322579243114146, -0.302545882788086,
    -0.29221622484174, -0.239207291311699, -0.218194346676734
    ]).reshape(10, 6, order='F')


class CheckGAMMixin(object):
    """Shared checks that compare fitted results with reference results.

    Subclasses are expected to set the class attributes ``res1`` (fitted
    results), ``res2`` (reference results), ``rtol_fitted`` and
    ``covp_corrfact`` in ``setup_class``.
    """

    @classmethod
    def _init(cls):
        """Build the constrained spline design for the motorcycle data.

        Returns
        -------
        exog : ndarray
            Cyclic cubic spline basis transformed by ``transf_constraints``,
            using the column means of the basis as constraint.
        penalty_matrix : ndarray
            Penalty matrix of the spline in the transformed basis.
        restriction : ndarray
            ``matrix_sqrt`` of ``penalty_matrix``.
        """
        # TODO: CyclicCubicSplines raises when using pandas
        cc_h = CyclicCubicSplines(np.asarray(data_mcycle['times']), df=[6])

        constraints = np.atleast_2d(cc_h.basis.mean(0))
        transf = transf_constraints(constraints)

        exog = cc_h.basis.dot(transf)
        penalty_matrix = transf.T.dot(cc_h.penalty_matrices[0]).dot(transf)
        restriction = matrix_sqrt(penalty_matrix)
        return exog, penalty_matrix, restriction

    def test_params(self):
        """Compare params, cov_params, scale and bse with the reference."""
        res1 = self.res1
        res2 = self.res2
        assert_allclose(res1.params, res2.params, rtol=1e-5)
        assert_allclose(np.asarray(res1.cov_params()),
                        res2.Vp * self.covp_corrfact, rtol=1e-6, atol=1e-9)

        assert_allclose(res1.scale, res2.scale * self.covp_corrfact,
                        rtol=1e-8)

        assert_allclose(np.asarray(res1.bse),
                        res2.se * np.sqrt(self.covp_corrfact),
                        rtol=1e-6, atol=1e-9)

    def test_fitted(self):
        """Compare fitted values with the reference fitted values."""
        res1 = self.res1
        res2 = self.res2
        assert_allclose(res1.fittedvalues, res2.fitted_values,
                        rtol=self.rtol_fitted)

    @pytest.mark.smoke
    def test_null_smoke(self):
        """Smoke test: only checks that accessing llnull does not raise."""
        self.res1.llnull


class TestTheilPLS5(CheckGAMMixin):
    """TheilGLS fit on the motorcycle data, checked against pls5."""

    cov_type = 'data-prior'

    @classmethod
    def setup_class(cls):
        exog, _, restriction = cls._init()
        endog = data_mcycle['accel']
        modp = TheilGLS(endog, exog, r_matrix=restriction)
        # scaling of penweight in R mgcv
        s_scale_r = 0.02630734
        # Theil penweight uses preliminary sigma2_e to scale penweight
        sigma_e = 1405.7950179165323
        cls.pw = pw = 1 / sigma_e / s_scale_r
        cls.res1 = modp.fit(pen_weight=pw, cov_type=cls.cov_type)
        cls.res2 = results_pls.pls5

        cls.rtol_fitted = 1e-7
        cls.covp_corrfact = 0.99786932844203202

    def test_cov_robust(self):
        """Refit with the sandwich covariance and compare with ``res2.Ve``."""
        res1 = self.res1
        res2 = self.res2
        pw = res1.penalization_factor
        res1 = res1.model.fit(pen_weight=pw, cov_type='sandwich')
        assert_allclose(np.asarray(res1.cov_params()),
                        res2.Ve * self.covp_corrfact, rtol=1e-4)

    def test_null_smoke(self):
        pytest.skip("llnull not available")


class TestGLMPenalizedPLS5(CheckGAMMixin):
    """Gaussian GLMPenalized with an L2 constraints penalty vs pls5."""

    cov_type = 'nonrobust'

    @classmethod
    def setup_class(cls):
        exog, _, restriction = cls._init()
        endog = data_mcycle['accel']
        pen = smpen.L2ConstraintsPenalty(restriction=restriction)
        mod = GLMPenalized(endog, exog, family=family.Gaussian(),
                           penal=pen)
        # scaling of penweight in R mgcv
        s_scale_r = 0.02630734
        # set pen_weight to correspond to R mgcv example
        cls.pw = mod.pen_weight = 1 / s_scale_r / 2
        cls.res1 = mod.fit(cov_type=cls.cov_type, method='bfgs', maxiter=100,
                           disp=0, trim=False, scale='x2')
        cls.res2 = results_pls.pls5

        cls.rtol_fitted = 1e-5
        # edf is currently not available with PenalizedMixin
        # need correction for difference in scale denominator
        cls.covp_corrfact = 1.0025464444310588

    # The leading underscore keeps pytest from collecting this check.
    def _test_cov_robust(self):
        # TODO: HC0 differs from Theil sandwich, difference is large
        res1 = self.res1
        res2 = self.res2
        pw = res1.model.pen_weight
        res1 = res1.model.fit(pen_weight=pw, cov_type='HC0')
        assert_allclose(np.asarray(res1.cov_params()),
                        res2.Ve * self.covp_corrfact, rtol=1e-4)


class TestGAM5Pirls(CheckGAMMixin):
    """GLMGam with centered cyclic cubic splines, default fit, vs pls5."""

    cov_type = 'nonrobust'

    @classmethod
    def setup_class(cls):
        s_scale = 0.0263073404164214

        x = data_mcycle['times'].values
        endog = data_mcycle['accel']
        cc = CyclicCubicSplines(x, df=[6], constraints='center')
        gam_cc = GLMGam(endog, smoother=cc, alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit()
        cls.res2 = results_pls.pls5

        cls.rtol_fitted = 1e-12
        # cls.covp_corrfact = 1.0025464444310588  # without edf
        # edf is implemented
        cls.covp_corrfact = 1


class TestGAM5Bfgs(CheckGAMMixin):
    """Same model as TestGAM5Pirls, fit with ``method='bfgs'``."""

    cov_type = 'nonrobust'

    @classmethod
    def setup_class(cls):
        s_scale = 0.0263073404164214

        x = data_mcycle['times'].values
        endog = data_mcycle['accel']
        cc = CyclicCubicSplines(x, df=[6], constraints='center')
        gam_cc = GLMGam(endog, smoother=cc, alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit(method='bfgs')
        cls.res2 = results_pls.pls5

        cls.rtol_fitted = 1e-5
        # cls.covp_corrfact = 1.0025464444310588  # without edf
        # edf is implemented
        cls.covp_corrfact = 1

    def test_predict(self):
        """Predict for two observations and compare with fitted values."""
        res1 = self.res1
        res2 = self.res2
        predicted = res1.predict(None, res1.model.smoother.x[2:4])
        assert_allclose(predicted, res1.fittedvalues[2:4],
                        rtol=1e-13)
        assert_allclose(predicted, res2.fitted_values[2:4],
                        rtol=self.rtol_fitted)


class TestGAM6Pirls(object):
    """GLMGam with unconstrained cyclic cubic splines, default fit."""

    rtol_fitted = 1e-7

    @classmethod
    def setup_class(cls):
        s_scale = 0.0263073404164214

        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6])
        gam_cc = GLMGam(data_mcycle['accel'], smoother=cc,
                        alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit()

    def test_fitted(self):
        """Check fittedvalues, get_prediction and predict for 15 rows."""
        res1 = self.res1
        pred = res1.get_prediction()

        assert_allclose(res1.fittedvalues[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)
        assert_allclose(pred.predicted_mean[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)

        predicted = res1.predict(None, res1.model.smoother.x[2:4])
        assert_allclose(predicted, pls6_fittedvalues[2:4],
                        rtol=self.rtol_fitted)


class TestGAM6Bfgs(object):
    """Same model as TestGAM6Pirls, fit with ``method='bfgs'``."""

    rtol_fitted = 1e-5

    @classmethod
    def setup_class(cls):
        s_scale = 0.0263073404164214

        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6])
        gam_cc = GLMGam(data_mcycle['accel'], smoother=cc,
                        alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit(method='bfgs')

    def test_fitted(self):
        """Check fittedvalues and get_prediction for the first 15 rows."""
        res1 = self.res1
        pred = res1.get_prediction()

        assert_allclose(res1.fittedvalues[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)
        assert_allclose(pred.predicted_mean[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)


class TestGAM6Bfgs0(object):
    """Model of TestGAM6Bfgs with ``alpha=0``."""

    rtol_fitted = 1e-5

    @classmethod
    def setup_class(cls):
        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6])
        gam_cc = GLMGam(data_mcycle['accel'], smoother=cc,
                        alpha=0)
        cls.res1 = gam_cc.fit(method='bfgs')

    def test_fitted(self):
        """Check fittedvalues and get_prediction for the first 15 rows."""
        res1 = self.res1
        pred = res1.get_prediction()

        # Expected values for alpha=0; these differ from the module-level
        # pls6_fittedvalues, hence the separate name.
        fittedvalues_alpha0 = np.array([
            2.63203377595747, 3.41285892739456, 5.78168657308338,
            7.35344779586831, 8.89178704316853, 15.7035642157176,
            16.4510219628328, 16.7474993878412, 17.3397025587698,
            17.1062522298643, 16.1786066072489, 16.1786066072489,
            13.7402485937614, 11.9531909618517, 10.9073964111009])

        assert_allclose(res1.fittedvalues[:15], fittedvalues_alpha0,
                        rtol=self.rtol_fitted)
        assert_allclose(pred.predicted_mean[:15], fittedvalues_alpha0,
                        rtol=self.rtol_fitted)


class TestGAM6ExogBfgs(object):
    """Centered splines plus a 1-dim column of ones as exog, bfgs fit."""

    rtol_fitted = 1e-5

    @classmethod
    def setup_class(cls):
        s_scale = 0.0263073404164214
        nobs = data_mcycle['times'].shape[0]
        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6],
                                constraints='center')
        gam_cc = GLMGam(data_mcycle['accel'], np.ones(nobs),
                        smoother=cc, alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit(method='bfgs')

    def test_fitted(self):
        """Check fittedvalues and get_prediction for the first 15 rows."""
        res1 = self.res1
        pred = res1.get_prediction()

        assert_allclose(res1.fittedvalues[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)
        assert_allclose(pred.predicted_mean[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)

    def test_exog(self):
        """Check the first 10 rows of the combined model exog."""
        exog = self.res1.model.exog
        assert_allclose(exog[:10], pls6_exog,
                        rtol=1e-13)


class TestGAM6ExogPirls(object):
    """Centered splines plus a 2-dim column of ones as exog, pirls fit."""

    rtol_fitted = 1e-5

    @classmethod
    def setup_class(cls):
        s_scale = 0.0263073404164214
        nobs = data_mcycle['times'].shape[0]
        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6],
                                constraints='center')
        gam_cc = GLMGam(data_mcycle['accel'], np.ones((nobs, 1)),
                        smoother=cc, alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit(method='pirls')

    def test_fitted(self):
        """Check fittedvalues and get_prediction for the first 15 rows."""
        res1 = self.res1
        pred = res1.get_prediction()

        assert_allclose(res1.fittedvalues[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)
        assert_allclose(pred.predicted_mean[:15], pls6_fittedvalues,
                        rtol=self.rtol_fitted)

    def test_exog(self):
        """Check the first 10 rows of the combined model exog."""
        exog = self.res1.model.exog
        assert_allclose(exog[:10], pls6_exog,
                        rtol=1e-13)


class TestGAMMPG(object):
    """Autos data: two cyclic cubic spline terms plus linear exog."""

    @classmethod
    def setup_class(cls):
        sp = np.array([6.46225497484073, 0.81532465890585])
        s_scale = np.array([2.95973613706629e-07, 0.000126203730141359])

        x_spline = df_autos[['weight', 'hp']].values
        exog = patsy.dmatrix('fuel + drive', data=df_autos)
        cc = CyclicCubicSplines(x_spline, df=[6, 5], constraints='center')
        # TODO alpha needs to be list
        alpha0 = 1 / s_scale * sp / 2
        # each model gets its own list so that the two fits share no state
        gam_cc = GLMGam(df_autos['city_mpg'], exog=exog, smoother=cc,
                        alpha=alpha0.tolist())
        cls.res1a = gam_cc.fit()
        gam_cc = GLMGam(df_autos['city_mpg'], exog=exog, smoother=cc,
                        alpha=alpha0.tolist())
        cls.res1b = gam_cc.fit(method='newton')

    def test_exog(self):
        """Compare the model exog of both fits with ``autos_exog.csv``."""
        file_path = os.path.join(cur_dir, "results", "autos_exog.csv")
        df_exog = pd.read_csv(file_path)
        res2_exog = df_exog.values
        for res1 in [self.res1a, self.res1b]:
            exog = res1.model.exog
            # exog contains zeros
            assert_allclose(exog, res2_exog, atol=1e-14)

    def test_fitted(self):
        """Compare fitted values and se_mean with ``autos_predict.csv``."""
        file_path = os.path.join(cur_dir, "results", "autos_predict.csv")
        df_pred = pd.read_csv(file_path, index_col="Row.names")
        df_pred.index = df_pred.index - 1
        res2_fittedvalues = df_pred["fit"].values
        res2_se_mean = df_pred["se_fit"].values

        # TODO: no edf, edf corrected df_resid
        # scale estimate differs
        # corr_fact = np.sqrt(191.669417019567 / 190)  # without edf
        # edf is implemented
        corr_fact = 1

        for res1 in [self.res1a, self.res1b]:
            pred = res1.get_prediction()
            assert_allclose(res1.fittedvalues, res2_fittedvalues,
                            rtol=1e-10)
            assert_allclose(pred.predicted_mean, res2_fittedvalues,
                            rtol=1e-10)
            assert_allclose(pred.se_mean, res2_se_mean * corr_fact,
                            rtol=1e-10)


class TestGAMMPGBS(CheckGAMMixin):
    # This has matching results from mgcv

    @classmethod
    def setup_class(cls):
        sp = np.array([0.830689464223685, 425.361212061649])
        cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

        x_spline = df_autos[['weight', 'hp']].values
        # We need asarray to remove the design_info
        # If design_info is attached,
        #     then exog_linear will also be transformed in predict.
        cls.exog = np.asarray(patsy.dmatrix('fuel + drive', data=df_autos))
        bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                      variable_names=['weight', 'hp'],
                      constraints='center',
                      include_intercept=True)
        # TODO alpha needs to be list
        alpha0 = 1 / s_scale * sp / 2
        gam_bs = GLMGam(df_autos['city_mpg'], exog=cls.exog, smoother=bs,
                        alpha=(alpha0).tolist())
        cls.res1a = gam_bs.fit(use_t=True)

        cls.res1b = gam_bs.fit(method='newton', use_t=True)
        cls.res1 = cls.res1a._results
        cls.res2 = results_mpg_bs.mpg_bs

        cls.rtol_fitted = 1e-8
        cls.covp_corrfact = 1  # not needed

        # for checking that alpha model attribute is unchanged, same as alpha0
        cls.alpha = [169947.78222669504, 26767.58046340008]

    @classmethod
    def _init(cls):
        # The shared motorcycle design is not used by this class.
        pass

    def test_edf(self):
        """Compare edf and hat matrix diagonal with the reference."""
        res1 = self.res1
        res2 = self.res2
        assert_allclose(res1.edf, res2.edf_all, rtol=1e-6)
        hat = res1.get_hat_matrix_diag()
        assert_allclose(hat, res2.hat, rtol=1e-6)

    def test_smooth(self):
        """Compare the first smoother's penalty matrix with the reference."""
        res1 = self.res1
        res2 = self.res2
        smoothers = res1.model.smoother.smoothers
        pen_matrix0 = smoothers[0].cov_der2
        assert_allclose(pen_matrix0, res2.smooth0.S * res2.smooth0.S_scale,
                        rtol=1e-6)

    def test_predict(self):
        """Predict for two observations and compare with fitted values."""
        res1 = self.res1
        res2 = self.res2
        predicted = res1.predict(self.exog[2:4], res1.model.smoother.x[2:4])
        assert_allclose(predicted, res1.fittedvalues[2:4],
                        rtol=1e-13)
        assert_allclose(predicted, res2.fitted_values[2:4],
                        rtol=self.rtol_fitted)

    def test_crossval(self):
        """Check penalty weight selection and that it leaves the model as is.
        """
        # includes some checks that penalization in the model is unchanged
        mod = self.res1.model
        assert_equal(mod.alpha, self.alpha)  # assert unchanged
        assert_allclose(self.res1.scale, 4.7064821354391118, rtol=1e-13)

        alpha_aic = mod.select_penweight()[0]
        # regression number, but in the right ball park
        assert_allclose(alpha_aic, [112487.81362014, 129.89155677],
                        rtol=1e-3)
        assert_equal(mod.alpha, self.alpha)  # assert unchanged
        assert_equal(mod.penal.start_idx, 4)
        pm = mod.penal.penalty_matrix()
        assert_equal(pm[:, :4], 0)
        assert_equal(pm[:4, :], 0)
        assert_allclose(self.res1.scale, 4.7064821354391118, rtol=1e-13)

        # fixed seed for the k-fold selection below
        np.random.seed(987125)
        alpha_cv, _ = mod.select_penweight_kfold(k_folds=3, k_grid=6)
        # regression number, but in the right ball park
        assert_allclose(alpha_cv, [10000000.0, 630.957344480193], rtol=1e-5)
        assert_equal(mod.alpha, self.alpha)  # assert unchanged
        assert_equal(mod.penal.start_idx, 4)
        pm = mod.penal.penalty_matrix()
        assert_equal(pm[:, :4], 0)
        assert_equal(pm[:4, :], 0)
        assert_allclose(self.res1.scale, 4.7064821354391118, rtol=1e-13)


class TestGAMMPGBSPoisson(CheckGAMMixin):
    # This has matching results from mgcv

    @classmethod
    def setup_class(cls):
        sp = np.array([40491.3940640059, 232455.530262537])
        # s_scale is same as before
        cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

        x_spline = df_autos[['weight', 'hp']].values
        cls.exog = patsy.dmatrix('fuel + drive', data=df_autos)
        bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                      variable_names=['weight', 'hp'],
                      constraints='center',
                      include_intercept=True)
        # TODO alpha needs to be list
        alpha0 = 1 / s_scale * sp / 2
        gam_bs = GLMGam(df_autos['city_mpg'], exog=cls.exog, smoother=bs,
                        family=family.Poisson(), alpha=alpha0)

        # overwrite the model's exog names in place with the patsy column
        # names followed by the smoother column names
        xnames = cls.exog.design_info.column_names + gam_bs.smoother.col_names
        gam_bs.exog_names[:] = xnames
        cls.res1a = gam_bs.fit(use_t=False)

        cls.res1b = gam_bs.fit(method='newton', use_t=True)
        cls.res1 = cls.res1a._results
        cls.res2 = results_mpg_bs_poisson.mpg_bs_poisson

        cls.rtol_fitted = 1e-8
        cls.covp_corrfact = 1  # not needed

    @classmethod
    def _init(cls):
        # The shared motorcycle design is not used by this class.
        pass

    def test_edf(self):
        """Compare edf, hat diagonal, aic, deviance and df_resid."""
        res1 = self.res1
        res2 = self.res2
        assert_allclose(res1.edf, res2.edf_all, rtol=1e-6)
        hat = res1.get_hat_matrix_diag()
        assert_allclose(hat, res2.hat, rtol=1e-6)
        assert_allclose(res1.aic, res2.aic, rtol=1e-8)
        assert_allclose(res1.deviance, res2.deviance, rtol=1e-8)
        assert_allclose(res1.df_resid, res2.residual_df, rtol=1e-8)

    def test_smooth(self):
        """Compare the first smoother's penalty matrix with the reference."""
        res1 = self.res1
        res2 = self.res2

        smoothers = res1.model.smoother.smoothers
        pen_matrix0 = smoothers[0].cov_der2
        assert_allclose(pen_matrix0, res2.smooth0.S * res2.smooth0.S_scale,
                        rtol=1e-6)

    def test_predict(self):
        """Check predict with DataFrame input, including the result index."""
        res1 = self.res1
        res2 = self.res2
        # this uses transform also for exog_linear
        # predicted = res1.predict(self.exog[2:4], res1.model.smoother.x[2:4])
        predicted = res1.predict(df_autos.iloc[2:4],
                                 res1.model.smoother.x[2:4])
        assert_allclose(predicted, res1.fittedvalues[2:4],
                        rtol=1e-13)
        assert_allclose(predicted, res2.fitted_values[2:4],
                        rtol=self.rtol_fitted)

        # linpred = res1.predict(self.exog[2:4], res1.model.smoother.x[2:4],
        #                        linear=True)
        xp = pd.DataFrame(res1.model.smoother.x[2:4])
        linpred = res1.predict(df_autos.iloc[2:4], xp,
                               linear=True)
        assert_allclose(linpred, res2.linear_predictors[2:4],
                        rtol=self.rtol_fitted)

        assert_equal(predicted.index.values, [2, 3])
        assert_equal(linpred.index.values, [2, 3])

    def test_wald(self):
        """Compare Wald term tests of the linear terms with the reference."""
        res1 = self.res1
        res2 = self.res2
        wtt = res1.wald_test_terms(
            skip_single=True,
            combine_terms=['fuel', 'drive', 'weight', 'hp'],
            scalar=True)
        # mgcv has term test for linear part
        assert_allclose(wtt.statistic[:2], res2.pTerms_chi_sq,
                        rtol=1e-7)
        assert_allclose(wtt.pvalues[:2], res2.pTerms_pv,
                        rtol=1e-6)
        assert_equal(wtt.df_constraints[:2], res2.pTerms_df)

    def test_select_alpha(self):
        """select_penweight should be close to the alpha set on the model."""
        res1 = self.res1
        alpha_mgcv = res1.model.alpha
        res_s = res1.model.select_penweight()
        assert_allclose(res_s[0], alpha_mgcv, rtol=5e-5)


class TestGAMMPGBSPoissonFormula(TestGAMMPGBSPoisson):
    # This is the same as the previous but with from_formula

    @classmethod
    def setup_class(cls):
        sp = np.array([40491.3940640059, 232455.530262537])
        # s_scale is same as before
        cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

        cls.exog = patsy.dmatrix('fuel + drive', data=df_autos)

        x_spline = df_autos[['weight', 'hp']].values
        bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                      variable_names=['weight', 'hp'],
                      constraints='center',
                      include_intercept=True)

        alpha0 = 1 / s_scale * sp / 2
        gam_bs = GLMGam.from_formula('city_mpg ~ fuel + drive', df_autos,
                                     smoother=bs, family=family.Poisson(),
                                     alpha=alpha0)

        cls.res1a = gam_bs.fit(use_t=False)

        cls.res1b = gam_bs.fit(method='newton', use_t=True)
        cls.res1 = cls.res1a._results
        cls.res2 = results_mpg_bs_poisson.mpg_bs_poisson

        cls.rtol_fitted = 1e-8
        cls.covp_corrfact = 1  # not needed

    def test_names_wrapper(self):
        """Check names, index and wrapper types of the formula results."""
        res1a = self.res1a
        xnames = ['Intercept', 'fuel[T.gas]', 'drive[T.fwd]', 'drive[T.rwd]',
                  'weight_s0', 'weight_s1', 'weight_s2', 'weight_s3',
                  'weight_s4', 'weight_s5', 'weight_s6', 'weight_s7',
                  'weight_s8', 'weight_s9', 'weight_s10',
                  'hp_s0', 'hp_s1', 'hp_s2', 'hp_s3', 'hp_s4', 'hp_s5',
                  'hp_s6', 'hp_s7', 'hp_s8']

        assert_equal(res1a.model.exog_names, xnames)
        assert_equal(res1a.model.design_info_linear.column_names,
                     xnames[:4])

        assert_equal(res1a.fittedvalues.iloc[2:4].index.values, [2, 3])
        assert_equal(res1a.params.index.values, xnames)
        assert_(isinstance(res1a.params, pd.Series))

        assert_(isinstance(res1a, GLMGamResultsWrapper))
        assert_(isinstance(res1a._results, GLMGamResults))