# -*- coding: utf-8 -*-
"""Principal component regression with selection of the number of factors.

Created on Sun Nov 14 08:21:41 2010
Author: josef-pktd
License: BSD (3-clause)
"""
import numpy as np

import statsmodels.api as sm
from statsmodels.iolib.table import SimpleTable
from statsmodels.sandbox.tools import pca
from statsmodels.sandbox.tools.cross_val import LeaveOneOut

#converted the example Principal Component Regression to a class
#from sandbox/example_pca_regression.py


class FactorModelUnivariate(object):
    '''Univariate endogenous variable regressed on factors of the exogenous
    variables obtained by principal component analysis.

    Todo:
    check treatment of const, make it optional ?
        add hasconst (0 or 1), needed when selecting nfact+hasconst
    options are arguments in calc_factors, should be more public instead
    cross-validation is slow for large number of observations
    '''
    def __init__(self, endog, exog):
        #do this in a superclass?
        self.endog = np.asarray(endog)
        self.exog = np.asarray(exog)

    def calc_factors(self, x=None, keepdim=0, addconst=True):
        '''get factor decomposition of exogenous variables

        This uses principal component analysis to obtain the factors. The
        number of factors kept is the maximum that will be considered in
        the regression; keepdim=0 keeps all of them.
        '''
        if x is None:
            x = self.exog
        else:
            x = np.asarray(x)
        xred, fact, evals, evecs = pca(x, keepdim=keepdim, normalize=1)
        self.exog_reduced = xred
        if addconst:
            self.factors = sm.add_constant(fact, prepend=True)
            self.hasconst = 1  #needs to be int
        else:
            self.factors = fact
            self.hasconst = 0  #needs to be int
        self.evals = evals
        self.evecs = evecs

    def fit_fixed_nfact(self, nfact):
        '''fit OLS on `nfact` factors plus the constant if one was added'''
        if not hasattr(self, 'factors'):
            self.calc_factors()
        #use hasconst instead of a hardcoded +1, see Todo in class docstring
        return sm.OLS(self.endog,
                      self.factors[:, :nfact + self.hasconst]).fit()

    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj,
        and additionally the cross-validation prediction error sum of
        squares if `skip_crossval` is false. Cross-validation is not used by
        default because it can be time consuming to calculate.

        By default the cross-validation method is leave-one-out on the full
        dataset. A different cross-validation sample can be specified as an
        argument to `cv_iter`.

        Results are attached in `results_find_nfact`.
        '''
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact + hasconst) < 1:
            raise ValueError('nothing to do, number of factors (incl. '
                             'constant) should be at least 1')

        #temporary safety
        maxfact = min(maxfact, 10)
        y0 = self.endog
        results = []
        #k counts the regressors, including the constant if one was added;
        #slicing the precomputed factors is faster than rerunning pca for
        #each k and gives the same result
        for k in range(1, maxfact + hasconst):
            fact = self.factors[:, :k]
            res = sm.OLS(y0, fact).fit()

            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                    err = y0[outidx] - res_l1o.model.predict(
                        res_l1o.params, fact[outidx, :])
                    #accumulate the squared prediction error of the left-out
                    #observation; .sum() keeps prederr2 a scalar
                    prederr2 += (err**2.).sum()
            else:
                #L1O column is NaN when cross-validation is skipped
                prederr2 = np.nan

            #store the number of PCA components excluding the constant, so
            #the table agrees with the notes in summary_find_nfact
            results.append([k - hasconst, res.aic, res.bic,
                            res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
        #number of factors of the best model by AIC, BIC, R2_adj and L1O
        #(argmin over the NaN-filled L1O column returns 0 if cross-validation
        #was skipped)
        bestidx = np.r_[(np.argmin(results[:, 1:3], 0),
                         np.argmax(results[:, 3], 0),
                         np.argmin(results[:, -1], 0))]
        self.best_nfact = results[bestidx, 0].astype(int)

    def summary_find_nfact(self):
        '''provides a summary for the selection of the number of factors

        Returns
        -------
        sumstr : str
            summary of the results for selecting the number of factors

        '''
        if not hasattr(self, 'results_find_nfact'):
            self.fit_find_nfact()

        results = self.results_find_nfact
        sumstr = ''
        sumstr += '\n' + 'Best result for k, by AIC, BIC, R2_adj, L1O'
        sumstr += '\n' + ' '*19 + '%5d %4d %6d %5d' % tuple(self.best_nfact)

        headers = 'k, AIC, BIC, R2_adj, L1O'.split(', ')
        numformat = ['%6d'] + ['%10.3f']*4
        txt_fmt1 = dict(data_fmts=numformat)
        tabl = SimpleTable(results, headers, None, txt_fmt=txt_fmt1)

        #title strings are left over from the original simulation example
        sumstr += '\n' + "PCA regression on simulated data,"
        sumstr += '\n' + "DGP: 2 factors and 4 explanatory variables"
        sumstr += '\n' + tabl.__str__()
        sumstr += '\n' + "Notes: k is number of components of PCA,"
        sumstr += '\n' + "       constant is added additionally"
        sumstr += '\n' + "       k=0 means regression on constant only"
        sumstr += '\n' + ("       L1O: sum of squared prediction errors for"
                          " leave-one-out")
        return sumstr


if __name__ == '__main__':

    examples = [1]
    if 1 in examples:
        nobs = 500
        #two random factors plus a constant
        f0 = np.c_[np.random.normal(size=(nobs, 2)), np.ones((nobs, 1))]
        #loading matrix mapping the factors to 4 explanatory variables;
        #the first two assignments are discarded alternatives, only the
        #last one is used
        f2xcoef = np.c_[np.repeat(np.eye(2), 2, 0), np.arange(4)[::-1]].T
        f2xcoef = np.array([[1., 1., 0., 0.],
                            [0., 0., 1., 1.],
                            [3., 2., 1., 0.]])
        f2xcoef = np.array([[0.1, 3., 1., 0.],
                            [0., 0., 1.5, 0.1],
                            [3., 2., 1., 0.]])
        x0 = np.dot(f0, f2xcoef)
        x0 += 0.1 * np.random.normal(size=x0.shape)
        ytrue = np.dot(f0, [1., 1., 1.])
        y0 = ytrue + 0.1 * np.random.normal(size=ytrue.shape)

        mod = FactorModelUnivariate(y0, x0)
        print(mod.summary_find_nfact())

        print("with cross validation - slower")
        mod.fit_find_nfact(maxfact=None, skip_crossval=False, cv_iter=None)
        print(mod.summary_find_nfact())
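
        #Minimal usage sketch (not part of the original example): refit with
        #a fixed number of factors; nfact=2 matches the simulated DGP above.
        res_fixed = mod.fit_fixed_nfact(2)
        print('OLS params with 2 factors + constant:', res_fixed.params)
        #eigenvalues from calc_factors indicate how much variance each
        #principal component explains
        print('PCA eigenvalues:', mod.evals)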