# -*- coding: utf-8 -*- """Tests for finding a positive semi-definite correlation or covariance matrix Created on Mon May 27 12:07:02 2013 Author: Josef Perktold """ import warnings import numpy as np from numpy.testing import assert_almost_equal, assert_allclose import scipy.sparse as sparse import pytest from statsmodels.stats.correlation_tools import ( corr_nearest, corr_clipped, cov_nearest, _project_correlation_factors, corr_nearest_factor, _spg_optim, corr_thresholded, cov_nearest_factor_homog, FactoredPSDMatrix) from statsmodels.tools.testing import Holder def norm_f(x, y): '''Frobenious norm (squared sum) of difference between two arrays ''' d = ((x - y)**2).sum() return np.sqrt(d) # R library Matrix results cov1_r = Holder() #> nc <- nearPD(pr, conv.tol = 1e-7, keepDiag = TRUE, doDykstra =FALSE, corr=TRUE) #> cat_items(nc, prefix="cov1_r.") cov1_r.mat = '''''' cov1_r.eigenvalues = np.array([ 4.197315628646795, 0.7540460243978023, 0.5077608149667492, 0.3801267599652769, 0.1607508970775889, 4.197315628646795e-08 ]) cov1_r.corr = '''TRUE''' cov1_r.normF = 0.0743805226512533 cov1_r.iterations = 11 cov1_r.rel_tol = 8.288594638441735e-08 cov1_r.converged = '''TRUE''' #> mkarray2(as.matrix(nc$mat), name="cov1_r.mat") cov1_r.mat = np.array([ 1, 0.487968018215892, 0.642651880010906, 0.4906386709070835, 0.6440990530811909, 0.8087111845493985, 0.487968018215892, 1, 0.5141147294352735, 0.2506688108312097, 0.672351311297074, 0.725832055882795, 0.642651880010906, 0.5141147294352735, 1, 0.596827778712154, 0.5821917790519067, 0.7449631633814129, 0.4906386709070835, 0.2506688108312097, 0.596827778712154, 1, 0.729882058012399, 0.772150225146826, 0.6440990530811909, 0.672351311297074, 0.5821917790519067, 0.729882058012399, 1, 0.813191720191944, 0.8087111845493985, 0.725832055882795, 0.7449631633814129, 0.772150225146826, 0.813191720191944, 1 ]).reshape(6,6, order='F') cov_r = Holder() #nc <- nearPD(pr+0.01*diag(6), conv.tol = 1e-7, keepDiag = TRUE, doDykstra =FALSE, corr=FALSE) #> cat_items(nc, prefix="cov_r.") #cov_r.mat = '''''' cov_r.eigenvalues = np.array([ 4.209897516692652, 0.7668341923072066, 0.518956980021938, 0.390838551407132, 0.1734728460460068, 4.209897516692652e-08 ]) cov_r.corr = '''FALSE''' cov_r.normF = 0.0623948693159157 cov_r.iterations = 11 cov_r.rel_tol = 5.83987595937896e-08 cov_r.converged = '''TRUE''' #> mkarray2(as.matrix(nc$mat), name="cov_r.mat") cov_r.mat = np.array([ 1.01, 0.486207476951913, 0.6428524769306785, 0.4886092840296514, 0.645175579158233, 0.811533860074678, 0.486207476951913, 1.01, 0.514394615153752, 0.2478398278204047, 0.673852495852274, 0.7297661648968664, 0.6428524769306785, 0.514394615153752, 1.01, 0.5971503271420517, 0.582018469844712, 0.7445177382760834, 0.4886092840296514, 0.2478398278204047, 0.5971503271420517, 1.01, 0.73161232298669, 0.7766852947049376, 0.645175579158233, 0.673852495852274, 0.582018469844712, 0.73161232298669, 1.01, 0.8107916469252828, 0.811533860074678, 0.7297661648968664, 0.7445177382760834, 0.7766852947049376, 0.8107916469252828, 1.01 ]).reshape(6,6, order='F') def test_corr_psd(): # test positive definite matrix is unchanged x = np.array([[1, -0.2, -0.9], [-0.2, 1, -0.2], [-0.9, -0.2, 1]]) y = corr_nearest(x, n_fact=100) #print np.max(np.abs(x - y)) assert_almost_equal(x, y, decimal=14) y = corr_clipped(x) assert_almost_equal(x, y, decimal=14) y = cov_nearest(x, n_fact=100) assert_almost_equal(x, y, decimal=14) x2 = x + 0.001 * np.eye(3) y = cov_nearest(x2, n_fact=100) assert_almost_equal(x2, y, decimal=14) class CheckCorrPSDMixin(object): def test_nearest(self): x = self.x res_r = self.res y = corr_nearest(x, threshold=1e-7, n_fact=100) #print np.max(np.abs(x - y)) assert_almost_equal(y, res_r.mat, decimal=3) d = norm_f(x, y) assert_allclose(d, res_r.normF, rtol=0.0015) evals = np.linalg.eigvalsh(y) #print 'evals', evals / res_r.eigenvalues[::-1] - 1 assert_allclose(evals, res_r.eigenvalues[::-1], rtol=0.003, atol=1e-7) #print evals[0] / 1e-7 - 1 assert_allclose(evals[0], 1e-7, rtol=1e-6) def test_clipped(self): x = self.x res_r = self.res y = corr_clipped(x, threshold=1e-7) #print np.max(np.abs(x - y)), np.max(np.abs((x - y) / y)) assert_almost_equal(y, res_r.mat, decimal=1) d = norm_f(x, y) assert_allclose(d, res_r.normF, rtol=0.15) evals = np.linalg.eigvalsh(y) assert_allclose(evals, res_r.eigenvalues[::-1], rtol=0.1, atol=1e-7) assert_allclose(evals[0], 1e-7, rtol=0.02) def test_cov_nearest(self): x = self.x res_r = self.res with warnings.catch_warnings(): warnings.simplefilter("ignore") y = cov_nearest(x, method='nearest', threshold=1e-7) #print np.max(np.abs(x - y)) assert_almost_equal(y, res_r.mat, decimal=2) d = norm_f(x, y) assert_allclose(d, res_r.normF, rtol=0.0015) class TestCovPSD(object): @classmethod def setup_class(cls): x = np.array([ 1, 0.477, 0.644, 0.478, 0.651, 0.826, 0.477, 1, 0.516, 0.233, 0.682, 0.75, 0.644, 0.516, 1, 0.599, 0.581, 0.742, 0.478, 0.233, 0.599, 1, 0.741, 0.8, 0.651, 0.682, 0.581, 0.741, 1, 0.798, 0.826, 0.75, 0.742, 0.8, 0.798, 1]).reshape(6,6) cls.x = x + 0.01 * np.eye(6) cls.res = cov_r def test_cov_nearest(self): x = self.x res_r = self.res y = cov_nearest(x, method='nearest') #print np.max(np.abs(x - y)) assert_almost_equal(y, res_r.mat, decimal=3) d = norm_f(x, y) assert_allclose(d, res_r.normF, rtol=0.001) y = cov_nearest(x, method='clipped') #print np.max(np.abs(x - y)) assert_almost_equal(y, res_r.mat, decimal=2) d = norm_f(x, y) assert_allclose(d, res_r.normF, rtol=0.15) class TestCorrPSD1(CheckCorrPSDMixin): @classmethod def setup_class(cls): x = np.array([ 1, 0.477, 0.644, 0.478, 0.651, 0.826, 0.477, 1, 0.516, 0.233, 0.682, 0.75, 0.644, 0.516, 1, 0.599, 0.581, 0.742, 0.478, 0.233, 0.599, 1, 0.741, 0.8, 0.651, 0.682, 0.581, 0.741, 1, 0.798, 0.826, 0.75, 0.742, 0.8, 0.798, 1]).reshape(6,6) cls.x = x cls.res = cov1_r @pytest.mark.parametrize('threshold', [0, 1e-15, 1e-10, 1e-6]) def test_corrpsd_threshold(threshold): x = np.array([[1, -0.9, -0.9], [-0.9, 1, -0.9], [-0.9, -0.9, 1]]) y = corr_nearest(x, n_fact=100, threshold=threshold) evals = np.linalg.eigvalsh(y) assert_allclose(evals[0], threshold, rtol=1e-6, atol=1e-15) y = corr_clipped(x, threshold=threshold) evals = np.linalg.eigvalsh(y) assert_allclose(evals[0], threshold, rtol=0.25, atol=1e-15) y = cov_nearest(x, method='nearest', n_fact=100, threshold=threshold) evals = np.linalg.eigvalsh(y) assert_allclose(evals[0], threshold, rtol=1e-6, atol=1e-15) y = cov_nearest(x, n_fact=100, threshold=threshold) evals = np.linalg.eigvalsh(y) assert_allclose(evals[0], threshold, rtol=0.25, atol=1e-15) class Test_Factor(object): def test_corr_nearest_factor_arrpack(self): # regression results for svds call u2 = np.array([[ 6.39407581e-19, 9.15225947e-03, 1.82631698e-02, 2.72917181e-02, 3.61975557e-02, 4.49413101e-02, 5.34848732e-02, 6.17916613e-02, 6.98268388e-02, 7.75575058e-02, 8.49528448e-02, 9.19842264e-02, 9.86252769e-02, 1.04851906e-01, 1.10642305e-01, 1.15976906e-01, 1.20838331e-01, 1.25211306e-01, 1.29082570e-01, 1.32440778e-01, 1.35276397e-01, 1.37581605e-01, 1.39350201e-01, 1.40577526e-01, 1.41260396e-01, 1.41397057e-01, 1.40987160e-01, 1.40031756e-01, 1.38533306e-01, 1.36495727e-01, 1.33924439e-01, 1.30826443e-01, 1.27210404e-01, 1.23086750e-01, 1.18467769e-01, 1.13367717e-01, 1.07802909e-01, 1.01791811e-01, 9.53551023e-02, 8.85157320e-02, 8.12989329e-02, 7.37322125e-02, 6.58453049e-02, 5.76700847e-02, 4.92404406e-02, 4.05921079e-02, 3.17624629e-02, 2.27902803e-02, 1.37154584e-02, 4.57871801e-03, -4.57871801e-03, -1.37154584e-02, -2.27902803e-02, -3.17624629e-02, -4.05921079e-02, -4.92404406e-02, -5.76700847e-02, -6.58453049e-02, -7.37322125e-02, -8.12989329e-02, -8.85157320e-02, -9.53551023e-02, -1.01791811e-01, -1.07802909e-01, -1.13367717e-01, -1.18467769e-01, -1.23086750e-01, -1.27210404e-01, -1.30826443e-01, -1.33924439e-01, -1.36495727e-01, -1.38533306e-01, -1.40031756e-01, -1.40987160e-01, -1.41397057e-01, -1.41260396e-01, -1.40577526e-01, -1.39350201e-01, -1.37581605e-01, -1.35276397e-01, -1.32440778e-01, -1.29082570e-01, -1.25211306e-01, -1.20838331e-01, -1.15976906e-01, -1.10642305e-01, -1.04851906e-01, -9.86252769e-02, -9.19842264e-02, -8.49528448e-02, -7.75575058e-02, -6.98268388e-02, -6.17916613e-02, -5.34848732e-02, -4.49413101e-02, -3.61975557e-02, -2.72917181e-02, -1.82631698e-02, -9.15225947e-03, -3.51829569e-17]]).T s2 = np.array([ 24.88812183]) d = 100 dm = 1 # Construct a test matrix with exact factor structure X = np.zeros((d,dm), dtype=np.float64) x = np.linspace(0, 2*np.pi, d) for j in range(dm): X[:,j] = np.sin(x*(j+1)) _project_correlation_factors(X) X *= 0.7 mat = np.dot(X, X.T) np.fill_diagonal(mat, 1.) from scipy.sparse.linalg import svds u, s, vt = svds(mat, dm) #difference in sign dsign = np.sign(u[1]) * np.sign(u2[1]) assert_allclose(u, dsign * u2, rtol=1e-6, atol=1e-14) assert_allclose(s, s2, rtol=1e-6) @pytest.mark.parametrize('dm', [1, 2]) def test_corr_nearest_factor(self, dm): objvals = [np.array([6241.8, 6241.8, 579.4, 264.6, 264.3]), np.array([2104.9, 2104.9, 710.5, 266.3, 286.1])] d = 100 # Construct a test matrix with exact factor structure X = np.zeros((d, dm), dtype=np.float64) x = np.linspace(0, 2 * np.pi, d) np.random.seed(10) for j in range(dm): X[:, j] = np.sin(x * (j + 1)) + 1e-10 * np.random.randn(d) _project_correlation_factors(X) assert np.isfinite(X).all() X *= 0.7 mat = np.dot(X, X.T) np.fill_diagonal(mat, 1.) # Try to recover the structure rslt = corr_nearest_factor(mat, dm, maxiter=10000) err_msg = 'rank=%d, niter=%d' % (dm, len(rslt.objective_values)) assert_allclose(rslt.objective_values[:5], objvals[dm - 1], rtol=0.5, err_msg=err_msg) assert rslt.Converged mat1 = rslt.corr.to_matrix() assert_allclose(mat, mat1, rtol=0.25, atol=1e-3, err_msg=err_msg) @pytest.mark.slow @pytest.mark.parametrize('dm', [1, 2]) def test_corr_nearest_factor_sparse(self, dm): # Test that result is the same if the input is dense or sparse d = 200 # Generate a test matrix of factors X = np.zeros((d, dm), dtype=np.float64) x = np.linspace(0, 2 * np.pi, d) rs = np.random.RandomState(10) for j in range(dm): X[:, j] = np.sin(x * (j + 1)) + rs.randn(d) # Get the correlation matrix _project_correlation_factors(X) X *= 0.7 mat = np.dot(X, X.T) np.fill_diagonal(mat, 1) # Threshold it mat.flat[np.abs(mat.flat) < 0.35] = 0.0 smat = sparse.csr_matrix(mat) dense_rslt = corr_nearest_factor(mat, dm, maxiter=10000) sparse_rslt = corr_nearest_factor(smat, dm, maxiter=10000) mat_dense = dense_rslt.corr.to_matrix() mat_sparse = sparse_rslt.corr.to_matrix() assert dense_rslt.Converged is sparse_rslt.Converged assert dense_rslt.Converged is True assert_allclose(mat_dense, mat_sparse, rtol=.25, atol=1e-3) # Test on a quadratic function. def test_spg_optim(self, reset_randomstate): dm = 100 ind = np.arange(dm) indmat = np.abs(ind[:,None] - ind[None,:]) M = 0.8**indmat def obj(x): return np.dot(x, np.dot(M, x)) def grad(x): return 2*np.dot(M, x) def project(x): return x x = np.random.normal(size=dm) rslt = _spg_optim(obj, grad, x, project) xnew = rslt.params assert rslt.Converged is True assert_almost_equal(obj(xnew), 0, decimal=3) def test_decorrelate(self, reset_randomstate): d = 30 dg = np.linspace(1, 2, d) root = np.random.normal(size=(d, 4)) fac = FactoredPSDMatrix(dg, root) mat = fac.to_matrix() rmat = np.linalg.cholesky(mat) dcr = fac.decorrelate(rmat) idm = np.dot(dcr, dcr.T) assert_almost_equal(idm, np.eye(d)) rhs = np.random.normal(size=(d, 5)) mat2 = np.dot(rhs.T, np.linalg.solve(mat, rhs)) mat3 = fac.decorrelate(rhs) mat3 = np.dot(mat3.T, mat3) assert_almost_equal(mat2, mat3) def test_logdet(self, reset_randomstate): d = 30 dg = np.linspace(1, 2, d) root = np.random.normal(size=(d, 4)) fac = FactoredPSDMatrix(dg, root) mat = fac.to_matrix() _, ld = np.linalg.slogdet(mat) ld2 = fac.logdet() assert_almost_equal(ld, ld2) def test_solve(self, reset_randomstate): d = 30 dg = np.linspace(1, 2, d) root = np.random.normal(size=(d, 2)) fac = FactoredPSDMatrix(dg, root) rhs = np.random.normal(size=(d, 5)) sr1 = fac.solve(rhs) mat = fac.to_matrix() sr2 = np.linalg.solve(mat, rhs) assert_almost_equal(sr1, sr2) @pytest.mark.parametrize('dm', [1, 2]) def test_cov_nearest_factor_homog(self, dm): d = 100 # Construct a test matrix with exact factor structure X = np.zeros((d, dm), dtype=np.float64) x = np.linspace(0, 2*np.pi, d) for j in range(dm): X[:, j] = np.sin(x*(j+1)) mat = np.dot(X, X.T) np.fill_diagonal(mat, np.diag(mat) + 3.1) # Try to recover the structure rslt = cov_nearest_factor_homog(mat, dm) mat1 = rslt.to_matrix() assert_allclose(mat, mat1, rtol=0.25, atol=1e-3) @pytest.mark.parametrize('dm', [1, 2]) def test_cov_nearest_factor_homog_sparse(self, dm): # Check that dense and sparse inputs give the same result d = 100 # Construct a test matrix with exact factor structure X = np.zeros((d, dm), dtype=np.float64) x = np.linspace(0, 2*np.pi, d) for j in range(dm): X[:, j] = np.sin(x*(j+1)) mat = np.dot(X, X.T) np.fill_diagonal(mat, np.diag(mat) + 3.1) # Fit to dense rslt = cov_nearest_factor_homog(mat, dm) mat1 = rslt.to_matrix() # Fit to sparse smat = sparse.csr_matrix(mat) rslt = cov_nearest_factor_homog(smat, dm) mat2 = rslt.to_matrix() assert_allclose(mat1, mat2, rtol=0.25, atol=1e-3) def test_corr_thresholded(self, reset_randomstate): import datetime t1 = datetime.datetime.now() X = np.random.normal(size=(2000,10)) tcor = corr_thresholded(X, 0.2, max_elt=4e6) t2 = datetime.datetime.now() ss = (t2-t1).seconds fcor = np.corrcoef(X) fcor *= (np.abs(fcor) >= 0.2) assert_allclose(tcor.todense(), fcor, rtol=0.25, atol=1e-3)