# -*- coding: utf-8 -*- """ Created on Tue Jun 12 13:18:12 2018 Author: Josef Perktold """ from statsmodels.compat.pandas import testing as pdt import os.path import numpy as np from numpy.testing import assert_allclose import pandas as pd import pytest from statsmodels.regression.linear_model import OLS from statsmodels.genmod.generalized_linear_model import GLM from statsmodels.genmod import families from statsmodels.stats.outliers_influence import MLEInfluence cur_dir = os.path.abspath(os.path.dirname(__file__)) file_name = 'binary_constrict.csv' file_path = os.path.join(cur_dir, 'results', file_name) data_bin = pd.read_csv(file_path, index_col=0) file_name = 'results_influence_logit.csv' file_path = os.path.join(cur_dir, 'results', file_name) results_sas_df = pd.read_csv(file_path, index_col=0) def test_influence_glm_bernoulli(): # example uses Finney's data and is used in Pregibon 1981 df = data_bin results_sas = np.asarray(results_sas_df) res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']], family=families.Binomial()).fit(attach_wls=True, atol=1e-10) infl = res.get_influence(observed=False) k_vars = 3 assert_allclose(infl.dfbetas, results_sas[:, 5:8], atol=1e-4) assert_allclose(infl.d_params, results_sas[:, 5:8] * res.bse.values, atol=1e-4) assert_allclose(infl.cooks_distance[0] * k_vars, results_sas[:, 8], atol=6e-5) assert_allclose(infl.hat_matrix_diag, results_sas[:, 4], atol=6e-5) c_bar = infl.cooks_distance[0] * 3 * (1 - infl.hat_matrix_diag) assert_allclose(c_bar, results_sas[:, 9], atol=6e-5) class InfluenceCompareExact(object): # Mixin to compare and test two Influence instances def test_basics(self): infl1 = self.infl1 infl0 = self.infl0 assert_allclose(infl0.hat_matrix_diag, infl1.hat_matrix_diag, rtol=1e-12) assert_allclose(infl0.resid_studentized, infl1.resid_studentized, rtol=1e-12, atol=1e-7) cd_rtol = getattr(self, 'cd_rtol', 1e-7) assert_allclose(infl0.cooks_distance[0], infl1.cooks_distance[0], rtol=cd_rtol) assert_allclose(infl0.dfbetas, infl1.dfbetas, rtol=1e-9, atol=5e-9) assert_allclose(infl0.d_params, infl1.d_params, rtol=1e-9, atol=5e-9) assert_allclose(infl0.d_fittedvalues, infl1.d_fittedvalues, rtol=5e-9) assert_allclose(infl0.d_fittedvalues_scaled, infl1.d_fittedvalues_scaled, rtol=5e-9) @pytest.mark.smoke @pytest.mark.matplotlib def test_plots(self, close_figures): infl1 = self.infl1 infl0 = self.infl0 fig = infl0.plot_influence(external=False) fig = infl1.plot_influence(external=False) fig = infl0.plot_index('resid', threshold=0.2, title='') fig = infl1.plot_index('resid', threshold=0.2, title='') fig = infl0.plot_index('dfbeta', idx=1, threshold=0.2, title='') fig = infl1.plot_index('dfbeta', idx=1, threshold=0.2, title='') fig = infl0.plot_index('cook', idx=1, threshold=0.2, title='') fig = infl1.plot_index('cook', idx=1, threshold=0.2, title='') fig = infl0.plot_index('hat', idx=1, threshold=0.2, title='') fig = infl1.plot_index('hat', idx=1, threshold=0.2, title='') def test_summary(self): infl1 = self.infl1 infl0 = self.infl0 df0 = infl0.summary_frame() df1 = infl1.summary_frame() assert_allclose(df0.values, df1.values, rtol=5e-5) pdt.assert_index_equal(df0.index, df1.index) def _check_looo(self): infl = self.infl1 # unwrap if needed results = getattr(infl.results, '_results', infl.results) res_looo = infl._res_looo mask_infl = infl.cooks_distance[0] > 2 * infl.cooks_distance[0].std() mask_low = ~mask_infl diff_params = results.params - res_looo['params'] assert_allclose(infl.d_params[mask_low], diff_params[mask_low], atol=0.05) assert_allclose(infl.params_one[mask_low], res_looo['params'][mask_low], rtol=0.01) class TestInfluenceLogitGLMMLE(InfluenceCompareExact): @classmethod def setup_class(cls): df = data_bin res = GLM(df['constrict'], df[['const', 'log_rate', 'log_volumne']], family=families.Binomial()).fit(attach_wls=True, atol=1e-10) cls.infl1 = res.get_influence() cls.infl0 = MLEInfluence(res) def test_looo(self): _check_looo(self) class TestInfluenceBinomialGLMMLE(InfluenceCompareExact): # example based on Williams and R docs @classmethod def setup_class(cls): yi = np.array([0, 2, 14, 19, 30]) ni = 40 * np.ones(len(yi)) xi = np.arange(1, len(yi) + 1) exog = np.column_stack((np.ones(len(yi)), xi)) endog = np.column_stack((yi, ni - yi)) res = GLM(endog, exog, family=families.Binomial()).fit() cls.infl1 = res.get_influence() cls.infl0 = MLEInfluence(res) cls.cd_rtol = 5e-5 def test_looo(self): _check_looo(self) def test_r(self): # values from R, # > xi <- 1:5 # > yi <- c(0,2,14,19,30) # number of mice responding to dose xi # > mi <- rep(40, 5) # number of mice exposed # > glmI <- glm(cbind(yi, mi -yi) ~ xi, family = binomial) # > imI <- influence.measures(glmI) # > t(imI$infmat) # dfbeta/dfbetas and dffits do not make sense to me and are furthe away from # looo than mine # resid seem to be resid_deviance based and not resid_pearson # I did not compare cov.r infl1 = self.infl1 cooks_d = [0.25220202795934726, 0.26107981497746285, 1.28985614424132389, 0.08449722285516942, 0.36362110845918005] hat = [0.2594393406119333, 0.3696442663244837, 0.3535768402250521, 0.389209198535791057, 0.6281303543027403] assert_allclose(infl1.hat_matrix_diag, hat, rtol=5e-6) assert_allclose(infl1.cooks_distance[0], cooks_d, rtol=1e-5) class TestInfluenceGaussianGLMMLE(InfluenceCompareExact): @classmethod def setup_class(cls): from .test_diagnostic import get_duncan_data endog, exog, labels = get_duncan_data() data = pd.DataFrame(np.column_stack((endog, exog)), columns='y const var1 var2'.split(), index=labels) res = GLM.from_formula('y ~ const + var1 + var2 - 1', data).fit() #res = GLM(endog, exog).fit() cls.infl1 = res.get_influence() cls.infl0 = MLEInfluence(res) def test_looo(self): _check_looo(self) class TestInfluenceGaussianGLMOLS(InfluenceCompareExact): @classmethod def setup_class(cls): from .test_diagnostic import get_duncan_data endog, exog, labels = get_duncan_data() data = pd.DataFrame(np.column_stack((endog, exog)), columns='y const var1 var2'.split(), index=labels) res0 = GLM.from_formula('y ~ const + var1 + var2 - 1', data).fit() res1 = OLS.from_formula('y ~ const + var1 + var2 - 1', data).fit() cls.infl1 = res1.get_influence() cls.infl0 = res0.get_influence() def test_basics(self): # needs to override attributes that are not equivalent, # i.e. not available or different definition like external vs internal infl1 = self.infl1 infl0 = self.infl0 assert_allclose(infl0.hat_matrix_diag, infl1.hat_matrix_diag, rtol=1e-12) assert_allclose(infl0.resid_studentized, infl1.resid_studentized, rtol=1e-12, atol=1e-7) assert_allclose(infl0.cooks_distance, infl1.cooks_distance, rtol=1e-7) assert_allclose(infl0.dfbetas, infl1.dfbetas, rtol=0.1) # changed # OLSInfluence only has looo dfbeta/d_params assert_allclose(infl0.d_params, infl1.dfbeta, rtol=1e-9, atol=1e-14) # d_fittedvalues is not available in OLSInfluence, i.e. only scaled dffits # assert_allclose(infl0.d_fittedvalues, infl1.d_fittedvalues, rtol=1e-9) assert_allclose(infl0.d_fittedvalues_scaled, infl1.dffits_internal[0], rtol=1e-9) # specific to linear link assert_allclose(infl0.d_linpred, infl0.d_fittedvalues, rtol=1e-12) assert_allclose(infl0.d_linpred_scaled, infl0.d_fittedvalues_scaled, rtol=1e-12) def test_summary(self): infl1 = self.infl1 infl0 = self.infl0 df0 = infl0.summary_frame() df1 = infl1.summary_frame() # just some basic check on overlap except for dfbetas cols = ['cooks_d', 'standard_resid', 'hat_diag', 'dffits_internal'] assert_allclose(df0[cols].values, df1[cols].values, rtol=1e-5) pdt.assert_index_equal(df0.index, df1.index)