""" Empirical likelihood inference on descriptive statistics This module conducts hypothesis tests and constructs confidence intervals for the mean, variance, skewness, kurtosis and correlation. If matplotlib is installed, this module can also generate multivariate confidence region plots as well as mean-variance contour plots. See _OptFuncts docstring for technical details and optimization variable definitions. General References: ------------------ Owen, A. (2001). "Empirical Likelihood." Chapman and Hall """ import numpy as np from scipy import optimize from scipy.stats import chi2, skew, kurtosis from statsmodels.base.optimizer import _fit_newton import itertools from statsmodels.graphics import utils def DescStat(endog): """ Returns an instance to conduct inference on descriptive statistics via empirical likelihood. See DescStatUV and DescStatMV for more information. Parameters ---------- endog : ndarray Array of data Returns : DescStat instance If k=1, the function returns a univariate instance, DescStatUV. If k>1, the function returns a multivariate instance, DescStatMV. """ if endog.ndim == 1: endog = endog.reshape(len(endog), 1) if endog.shape[1] == 1: return DescStatUV(endog) if endog.shape[1] > 1: return DescStatMV(endog) class _OptFuncts(object): """ A class that holds functions that are optimized/solved. The general setup of the class is simple. Any method that starts with _opt_ creates a vector of estimating equations named est_vect such that np.dot(p, (est_vect))=0 where p is the weight on each observation as a 1 x n array and est_vect is n x k. Then _modif_Newton is called to determine the optimal p by solving for the Lagrange multiplier (eta) in the profile likelihood maximization problem. In the presence of nuisance parameters, _opt_ functions are optimized over to profile out the nuisance parameters. Any method starting with _ci_limits calculates the log likelihood ratio for a specific value of a parameter and then subtracts a pre-specified critical value. This is solved so that llr - crit = 0. """ def __init__(self, endog): pass def _log_star(self, eta, est_vect, weights, nobs): """ Transforms the log of observation probabilities in terms of the Lagrange multiplier to the log 'star' of the probabilities. Parameters ---------- eta : float Lagrange multiplier est_vect : ndarray (n,k) Estimating equations vector wts : nx1 array Observation weights Returns ------ data_star : ndarray The weighted logstar of the estimting equations Notes ----- This function is only a placeholder for the _fit_Newton. The function value is not used in optimization and the optimal value is disregarded when computing the log likelihood ratio. """ data_star = np.log(weights) + (np.sum(weights) +\ np.dot(est_vect, eta)) idx = data_star < 1. / nobs not_idx = ~idx nx = nobs * data_star[idx] data_star[idx] = np.log(1. / nobs) - 1.5 + nx * (2. - nx / 2) data_star[not_idx] = np.log(data_star[not_idx]) return data_star def _hess(self, eta, est_vect, weights, nobs): """ Calculates the hessian of a weighted empirical likelihood problem. Parameters ---------- eta : ndarray, (1,m) Lagrange multiplier in the profile likelihood maximization est_vect : ndarray (n,k) Estimating equations vector weights : 1darray Observation weights Returns ------- hess : m x m array Weighted hessian used in _wtd_modif_newton """ #eta = np.squeeze(eta) data_star_doub_prime = np.sum(weights) + np.dot(est_vect, eta) idx = data_star_doub_prime < 1. / nobs not_idx = ~idx data_star_doub_prime[idx] = - nobs ** 2 data_star_doub_prime[not_idx] = - (data_star_doub_prime[not_idx]) ** -2 wtd_dsdp = weights * data_star_doub_prime return np.dot(est_vect.T, wtd_dsdp[:, None] * est_vect) def _grad(self, eta, est_vect, weights, nobs): """ Calculates the gradient of a weighted empirical likelihood problem Parameters ---------- eta : ndarray, (1,m) Lagrange multiplier in the profile likelihood maximization est_vect : ndarray, (n,k) Estimating equations vector weights : 1darray Observation weights Returns ------- gradient : ndarray (m,1) The gradient used in _wtd_modif_newton """ #eta = np.squeeze(eta) data_star_prime = np.sum(weights) + np.dot(est_vect, eta) idx = data_star_prime < 1. / nobs not_idx = ~idx data_star_prime[idx] = nobs * (2 - nobs * data_star_prime[idx]) data_star_prime[not_idx] = 1. / data_star_prime[not_idx] return np.dot(weights * data_star_prime, est_vect) def _modif_newton(self, eta, est_vect, weights): """ Modified Newton's method for maximizing the log 'star' equation. This function calls _fit_newton to find the optimal values of eta. Parameters ---------- eta : ndarray, (1,m) Lagrange multiplier in the profile likelihood maximization est_vect : ndarray, (n,k) Estimating equations vector weights : 1darray Observation weights Returns ------- params : 1xm array Lagrange multiplier that maximizes the log-likelihood """ nobs = len(est_vect) f = lambda x0: - np.sum(self._log_star(x0, est_vect, weights, nobs)) grad = lambda x0: - self._grad(x0, est_vect, weights, nobs) hess = lambda x0: - self._hess(x0, est_vect, weights, nobs) kwds = {'tol': 1e-8} eta = eta.squeeze() res = _fit_newton(f, grad, eta, (), kwds, hess=hess, maxiter=50, \ disp=0) return res[0] def _find_eta(self, eta): """ Finding the root of sum(xi-h0)/(1+eta(xi-mu)) solves for eta when computing ELR for univariate mean. Parameters ---------- eta : float Lagrange multiplier in the empirical likelihood maximization Returns ------- llr : float n times the log likelihood value for a given value of eta """ return np.sum((self.endog - self.mu0) / \ (1. + eta * (self.endog - self.mu0))) def _ci_limits_mu(self, mu): """ Calculates the difference between the log likelihood of mu_test and a specified critical value. Parameters ---------- mu : float Hypothesized value of the mean. Returns ------- diff : float The difference between the log likelihood value of mu0 and a specified value. """ return self.test_mean(mu)[0] - self.r0 def _find_gamma(self, gamma): """ Finds gamma that satisfies sum(log(n * w(gamma))) - log(r0) = 0 Used for confidence intervals for the mean Parameters ---------- gamma : float Lagrange multiplier when computing confidence interval Returns ------- diff : float The difference between the log-liklihood when the Lagrange multiplier is gamma and a pre-specified value """ denom = np.sum((self.endog - gamma) ** -1) new_weights = (self.endog - gamma) ** -1 / denom return -2 * np.sum(np.log(self.nobs * new_weights)) - \ self.r0 def _opt_var(self, nuisance_mu, pval=False): """ This is the function to be optimized over a nuisance mean parameter to determine the likelihood ratio for the variance Parameters ---------- nuisance_mu : float Value of a nuisance mean parameter Returns ------- llr : float Log likelihood of a pre-specified variance holding the nuisance parameter constant """ endog = self.endog nobs = self.nobs sig_data = ((endog - nuisance_mu) ** 2 \ - self.sig2_0) mu_data = (endog - nuisance_mu) est_vect = np.column_stack((mu_data, sig_data)) eta_star = self._modif_newton(np.array([1. / nobs, 1. / nobs]), est_vect, np.ones(nobs) * (1. / nobs)) denom = 1 + np.dot(eta_star, est_vect.T) self.new_weights = 1. / nobs * 1. / denom llr = np.sum(np.log(nobs * self.new_weights)) if pval: # Used for contour plotting return chi2.sf(-2 * llr, 1) return -2 * llr def _ci_limits_var(self, var): """ Used to determine the confidence intervals for the variance. It calls test_var and when called by an optimizer, finds the value of sig2_0 that is chi2.ppf(significance-level) Parameters ---------- var_test : float Hypothesized value of the variance Returns ------- diff : float The difference between the log likelihood ratio at var_test and a pre-specified value. """ return self.test_var(var)[0] - self.r0 def _opt_skew(self, nuis_params): """ Called by test_skew. This function is optimized over nuisance parameters mu and sigma Parameters ---------- nuis_params : 1darray An array with a nuisance mean and variance parameter Returns ------- llr : float The log likelihood ratio of a pre-specified skewness holding the nuisance parameters constant. """ endog = self.endog nobs = self.nobs mu_data = endog - nuis_params[0] sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1] skew_data = ((((endog - nuis_params[0]) ** 3) / (nuis_params[1] ** 1.5))) - self.skew0 est_vect = np.column_stack((mu_data, sig_data, skew_data)) eta_star = self._modif_newton(np.array([1. / nobs, 1. / nobs, 1. / nobs]), est_vect, np.ones(nobs) * (1. / nobs)) denom = 1. + np.dot(eta_star, est_vect.T) self.new_weights = 1. / nobs * 1. / denom llr = np.sum(np.log(nobs * self.new_weights)) return -2 * llr def _opt_kurt(self, nuis_params): """ Called by test_kurt. This function is optimized over nuisance parameters mu and sigma Parameters ---------- nuis_params : 1darray An array with a nuisance mean and variance parameter Returns ------- llr : float The log likelihood ratio of a pre-speified kurtosis holding the nuisance parameters constant """ endog = self.endog nobs = self.nobs mu_data = endog - nuis_params[0] sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1] kurt_data = (((((endog - nuis_params[0]) ** 4) / \ (nuis_params[1] ** 2))) - 3) - self.kurt0 est_vect = np.column_stack((mu_data, sig_data, kurt_data)) eta_star = self._modif_newton(np.array([1. / nobs, 1. / nobs, 1. / nobs]), est_vect, np.ones(nobs) * (1. / nobs)) denom = 1 + np.dot(eta_star, est_vect.T) self.new_weights = 1. / nobs * 1. / denom llr = np.sum(np.log(nobs * self.new_weights)) return -2 * llr def _opt_skew_kurt(self, nuis_params): """ Called by test_joint_skew_kurt. This function is optimized over nuisance parameters mu and sigma Parameters ---------- nuis_params : 1darray An array with a nuisance mean and variance parameter Returns ------ llr : float The log likelihood ratio of a pre-speified skewness and kurtosis holding the nuisance parameters constant. """ endog = self.endog nobs = self.nobs mu_data = endog - nuis_params[0] sig_data = ((endog - nuis_params[0]) ** 2) - nuis_params[1] skew_data = ((((endog - nuis_params[0]) ** 3) / \ (nuis_params[1] ** 1.5))) - self.skew0 kurt_data = (((((endog - nuis_params[0]) ** 4) / \ (nuis_params[1] ** 2))) - 3) - self.kurt0 est_vect = np.column_stack((mu_data, sig_data, skew_data, kurt_data)) eta_star = self._modif_newton(np.array([1. / nobs, 1. / nobs, 1. / nobs, 1. / nobs]), est_vect, np.ones(nobs) * (1. / nobs)) denom = 1. + np.dot(eta_star, est_vect.T) self.new_weights = 1. / nobs * 1. / denom llr = np.sum(np.log(nobs * self.new_weights)) return -2 * llr def _ci_limits_skew(self, skew): """ Parameters ---------- skew0 : float Hypothesized value of skewness Returns ------- diff : float The difference between the log likelihood ratio at skew and a pre-specified value. """ return self.test_skew(skew)[0] - self.r0 def _ci_limits_kurt(self, kurt): """ Parameters ---------- skew0 : float Hypothesized value of kurtosis Returns ------- diff : float The difference between the log likelihood ratio at kurt and a pre-specified value. """ return self.test_kurt(kurt)[0] - self.r0 def _opt_correl(self, nuis_params, corr0, endog, nobs, x0, weights0): """ Parameters ---------- nuis_params : 1darray Array containing two nuisance means and two nuisance variances Returns ------- llr : float The log-likelihood of the correlation coefficient holding nuisance parameters constant """ mu1_data, mu2_data = (endog - nuis_params[::2]).T sig1_data = mu1_data ** 2 - nuis_params[1] sig2_data = mu2_data ** 2 - nuis_params[3] correl_data = ((mu1_data * mu2_data) - corr0 * (nuis_params[1] * nuis_params[3]) ** .5) est_vect = np.column_stack((mu1_data, sig1_data, mu2_data, sig2_data, correl_data)) eta_star = self._modif_newton(x0, est_vect, weights0) denom = 1. + np.dot(est_vect, eta_star) self.new_weights = 1. / nobs * 1. / denom llr = np.sum(np.log(nobs * self.new_weights)) return -2 * llr def _ci_limits_corr(self, corr): return self.test_corr(corr)[0] - self.r0 class DescStatUV(_OptFuncts): """ A class to compute confidence intervals and hypothesis tests involving mean, variance, kurtosis and skewness of a univariate random variable. Parameters ---------- endog : 1darray Data to be analyzed Attributes ---------- endog : 1darray Data to be analyzed nobs : float Number of observations """ def __init__(self, endog): self.endog = np.squeeze(endog) self.nobs = endog.shape[0] def test_mean(self, mu0, return_weights=False): """ Returns - 2 x log-likelihood ratio, p-value and weights for a hypothesis test of the mean. Parameters ---------- mu0 : float Mean value to be tested return_weights : bool If return_weights is True the function returns the weights of the observations under the null hypothesis. Default is False Returns ------- test_results : tuple The log-likelihood ratio and p-value of mu0 """ self.mu0 = mu0 endog = self.endog nobs = self.nobs eta_min = (1. - (1. / nobs)) / (self.mu0 - max(endog)) eta_max = (1. - (1. / nobs)) / (self.mu0 - min(endog)) eta_star = optimize.brentq(self._find_eta, eta_min, eta_max) new_weights = (1. / nobs) * 1. / (1. + eta_star * (endog - self.mu0)) llr = -2 * np.sum(np.log(nobs * new_weights)) if return_weights: return llr, chi2.sf(llr, 1), new_weights else: return llr, chi2.sf(llr, 1) def ci_mean(self, sig=.05, method='gamma', epsilon=10 ** -8, gamma_low=-10 ** 10, gamma_high=10 ** 10): """ Returns the confidence interval for the mean. Parameters ---------- sig : float significance level. Default is .05 method : str Root finding method, Can be 'nested-brent' or 'gamma'. Default is 'gamma' 'gamma' Tries to solve for the gamma parameter in the Lagrange (see Owen pg 22) and then determine the weights. 'nested brent' uses brents method to find the confidence intervals but must maximize the likelihood ratio on every iteration. gamma is generally much faster. If the optimizations does not converge, try expanding the gamma_high and gamma_low variable. gamma_low : float Lower bound for gamma when finding lower limit. If function returns f(a) and f(b) must have different signs, consider lowering gamma_low. gamma_high : float Upper bound for gamma when finding upper limit. If function returns f(a) and f(b) must have different signs, consider raising gamma_high. epsilon : float When using 'nested-brent', amount to decrease (increase) from the maximum (minimum) of the data when starting the search. This is to protect against the likelihood ratio being zero at the maximum (minimum) value of the data. If data is very small in absolute value (<10 ``**`` -6) consider shrinking epsilon When using 'gamma', amount to decrease (increase) the minimum (maximum) by to start the search for gamma. If function returns f(a) and f(b) must have different signs, consider lowering epsilon. Returns ------- Interval : tuple Confidence interval for the mean """ endog = self.endog sig = 1 - sig if method == 'nested-brent': self.r0 = chi2.ppf(sig, 1) middle = np.mean(endog) epsilon_u = (max(endog) - np.mean(endog)) * epsilon epsilon_l = (np.mean(endog) - min(endog)) * epsilon ulim = optimize.brentq(self._ci_limits_mu, middle, max(endog) - epsilon_u) llim = optimize.brentq(self._ci_limits_mu, middle, min(endog) + epsilon_l) return llim, ulim if method == 'gamma': self.r0 = chi2.ppf(sig, 1) gamma_star_l = optimize.brentq(self._find_gamma, gamma_low, min(endog) - epsilon) gamma_star_u = optimize.brentq(self._find_gamma, \ max(endog) + epsilon, gamma_high) weights_low = ((endog - gamma_star_l) ** -1) / \ np.sum((endog - gamma_star_l) ** -1) weights_high = ((endog - gamma_star_u) ** -1) / \ np.sum((endog - gamma_star_u) ** -1) mu_low = np.sum(weights_low * endog) mu_high = np.sum(weights_high * endog) return mu_low, mu_high def test_var(self, sig2_0, return_weights=False): """ Returns -2 x log-likelihood ratio and the p-value for the hypothesized variance Parameters ---------- sig2_0 : float Hypothesized variance to be tested return_weights : bool If True, returns the weights that maximize the likelihood of observing sig2_0. Default is False Returns ------- test_results : tuple The log-likelihood ratio and the p_value of sig2_0 Examples -------- >>> import numpy as np >>> import statsmodels.api as sm >>> random_numbers = np.random.standard_normal(1000)*100 >>> el_analysis = sm.emplike.DescStat(random_numbers) >>> hyp_test = el_analysis.test_var(9500) """ self.sig2_0 = sig2_0 mu_max = max(self.endog) mu_min = min(self.endog) llr = optimize.fminbound(self._opt_var, mu_min, mu_max, \ full_output=1)[1] p_val = chi2.sf(llr, 1) if return_weights: return llr, p_val, self.new_weights.T else: return llr, p_val def ci_var(self, lower_bound=None, upper_bound=None, sig=.05): """ Returns the confidence interval for the variance. Parameters ---------- lower_bound : float The minimum value the lower confidence interval can take. The p-value from test_var(lower_bound) must be lower than 1 - significance level. Default is .99 confidence limit assuming normality upper_bound : float The maximum value the upper confidence interval can take. The p-value from test_var(upper_bound) must be lower than 1 - significance level. Default is .99 confidence limit assuming normality sig : float The significance level. Default is .05 Returns ------- Interval : tuple Confidence interval for the variance Examples -------- >>> import numpy as np >>> import statsmodels.api as sm >>> random_numbers = np.random.standard_normal(100) >>> el_analysis = sm.emplike.DescStat(random_numbers) >>> el_analysis.ci_var() (0.7539322567470305, 1.229998852496268) >>> el_analysis.ci_var(.5, 2) (0.7539322567469926, 1.2299988524962664) Notes ----- If the function returns the error f(a) and f(b) must have different signs, consider lowering lower_bound and raising upper_bound. """ endog = self.endog if upper_bound is None: upper_bound = ((self.nobs - 1) * endog.var()) / \ (chi2.ppf(.0001, self.nobs - 1)) if lower_bound is None: lower_bound = ((self.nobs - 1) * endog.var()) / \ (chi2.ppf(.9999, self.nobs - 1)) self.r0 = chi2.ppf(1 - sig, 1) llim = optimize.brentq(self._ci_limits_var, lower_bound, endog.var()) ulim = optimize.brentq(self._ci_limits_var, endog.var(), upper_bound) return llim, ulim def plot_contour(self, mu_low, mu_high, var_low, var_high, mu_step, var_step, levs=[.2, .1, .05, .01, .001]): """ Returns a plot of the confidence region for a univariate mean and variance. Parameters ---------- mu_low : float Lowest value of the mean to plot mu_high : float Highest value of the mean to plot var_low : float Lowest value of the variance to plot var_high : float Highest value of the variance to plot mu_step : float Increments to evaluate the mean var_step : float Increments to evaluate the mean levs : list Which values of significance the contour lines will be drawn. Default is [.2, .1, .05, .01, .001] Returns ------- Figure The contour plot """ fig, ax = utils.create_mpl_ax() ax.set_ylabel('Variance') ax.set_xlabel('Mean') mu_vect = list(np.arange(mu_low, mu_high, mu_step)) var_vect = list(np.arange(var_low, var_high, var_step)) z = [] for sig0 in var_vect: self.sig2_0 = sig0 for mu0 in mu_vect: z.append(self._opt_var(mu0, pval=True)) z = np.asarray(z).reshape(len(var_vect), len(mu_vect)) ax.contour(mu_vect, var_vect, z, levels=levs) return fig def test_skew(self, skew0, return_weights=False): """ Returns -2 x log-likelihood and p-value for the hypothesized skewness. Parameters ---------- skew0 : float Skewness value to be tested return_weights : bool If True, function also returns the weights that maximize the likelihood ratio. Default is False. Returns ------- test_results : tuple The log-likelihood ratio and p_value of skew0 """ self.skew0 = skew0 start_nuisance = np.array([self.endog.mean(), self.endog.var()]) llr = optimize.fmin_powell(self._opt_skew, start_nuisance, full_output=1, disp=0)[1] p_val = chi2.sf(llr, 1) if return_weights: return llr, p_val, self.new_weights.T return llr, p_val def test_kurt(self, kurt0, return_weights=False): """ Returns -2 x log-likelihood and the p-value for the hypothesized kurtosis. Parameters ---------- kurt0 : float Kurtosis value to be tested return_weights : bool If True, function also returns the weights that maximize the likelihood ratio. Default is False. Returns ------- test_results : tuple The log-likelihood ratio and p-value of kurt0 """ self.kurt0 = kurt0 start_nuisance = np.array([self.endog.mean(), self.endog.var()]) llr = optimize.fmin_powell(self._opt_kurt, start_nuisance, full_output=1, disp=0)[1] p_val = chi2.sf(llr, 1) if return_weights: return llr, p_val, self.new_weights.T return llr, p_val def test_joint_skew_kurt(self, skew0, kurt0, return_weights=False): """ Returns - 2 x log-likelihood and the p-value for the joint hypothesis test for skewness and kurtosis Parameters ---------- skew0 : float Skewness value to be tested kurt0 : float Kurtosis value to be tested return_weights : bool If True, function also returns the weights that maximize the likelihood ratio. Default is False. Returns ------- test_results : tuple The log-likelihood ratio and p-value of the joint hypothesis test. """ self.skew0 = skew0 self.kurt0 = kurt0 start_nuisance = np.array([self.endog.mean(), self.endog.var()]) llr = optimize.fmin_powell(self._opt_skew_kurt, start_nuisance, full_output=1, disp=0)[1] p_val = chi2.sf(llr, 2) if return_weights: return llr, p_val, self.new_weights.T return llr, p_val def ci_skew(self, sig=.05, upper_bound=None, lower_bound=None): """ Returns the confidence interval for skewness. Parameters ---------- sig : float The significance level. Default is .05 upper_bound : float Maximum value of skewness the upper limit can be. Default is .99 confidence limit assuming normality. lower_bound : float Minimum value of skewness the lower limit can be. Default is .99 confidence level assuming normality. Returns ------- Interval : tuple Confidence interval for the skewness Notes ----- If function returns f(a) and f(b) must have different signs, consider expanding lower and upper bounds """ nobs = self.nobs endog = self.endog if upper_bound is None: upper_bound = skew(endog) + \ 2.5 * ((6. * nobs * (nobs - 1.)) / \ ((nobs - 2.) * (nobs + 1.) * \ (nobs + 3.))) ** .5 if lower_bound is None: lower_bound = skew(endog) - \ 2.5 * ((6. * nobs * (nobs - 1.)) / \ ((nobs - 2.) * (nobs + 1.) * \ (nobs + 3.))) ** .5 self.r0 = chi2.ppf(1 - sig, 1) llim = optimize.brentq(self._ci_limits_skew, lower_bound, skew(endog)) ulim = optimize.brentq(self._ci_limits_skew, skew(endog), upper_bound) return llim, ulim def ci_kurt(self, sig=.05, upper_bound=None, lower_bound=None): """ Returns the confidence interval for kurtosis. Parameters ---------- sig : float The significance level. Default is .05 upper_bound : float Maximum value of kurtosis the upper limit can be. Default is .99 confidence limit assuming normality. lower_bound : float Minimum value of kurtosis the lower limit can be. Default is .99 confidence limit assuming normality. Returns ------- Interval : tuple Lower and upper confidence limit Notes ----- For small n, upper_bound and lower_bound may have to be provided by the user. Consider using test_kurt to find values close to the desired significance level. If function returns f(a) and f(b) must have different signs, consider expanding the bounds. """ endog = self.endog nobs = self.nobs if upper_bound is None: upper_bound = kurtosis(endog) + \ (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \ ((nobs - 2.) * (nobs + 1.) * \ (nobs + 3.))) ** .5) * \ (((nobs ** 2.) - 1.) / ((nobs - 3.) *\ (nobs + 5.))) ** .5) if lower_bound is None: lower_bound = kurtosis(endog) - \ (2.5 * (2. * ((6. * nobs * (nobs - 1.)) / \ ((nobs - 2.) * (nobs + 1.) * \ (nobs + 3.))) ** .5) * \ (((nobs ** 2.) - 1.) / ((nobs - 3.) *\ (nobs + 5.))) ** .5) self.r0 = chi2.ppf(1 - sig, 1) llim = optimize.brentq(self._ci_limits_kurt, lower_bound, \ kurtosis(endog)) ulim = optimize.brentq(self._ci_limits_kurt, kurtosis(endog), \ upper_bound) return llim, ulim class DescStatMV(_OptFuncts): """ A class for conducting inference on multivariate means and correlation. Parameters ---------- endog : ndarray Data to be analyzed Attributes ---------- endog : ndarray Data to be analyzed nobs : float Number of observations """ def __init__(self, endog): self.endog = endog self.nobs = endog.shape[0] def mv_test_mean(self, mu_array, return_weights=False): """ Returns -2 x log likelihood and the p-value for a multivariate hypothesis test of the mean Parameters ---------- mu_array : 1d array Hypothesized values for the mean. Must have same number of elements as columns in endog return_weights : bool If True, returns the weights that maximize the likelihood of mu_array. Default is False. Returns ------- test_results : tuple The log-likelihood ratio and p-value for mu_array """ endog = self.endog nobs = self.nobs if len(mu_array) != endog.shape[1]: raise ValueError('mu_array must have the same number of ' 'elements as the columns of the data.') mu_array = mu_array.reshape(1, endog.shape[1]) means = np.ones((endog.shape[0], endog.shape[1])) means = mu_array * means est_vect = endog - means start_vals = 1. / nobs * np.ones(endog.shape[1]) eta_star = self._modif_newton(start_vals, est_vect, np.ones(nobs) * (1. / nobs)) denom = 1 + np.dot(eta_star, est_vect.T) self.new_weights = 1 / nobs * 1 / denom llr = -2 * np.sum(np.log(nobs * self.new_weights)) p_val = chi2.sf(llr, mu_array.shape[1]) if return_weights: return llr, p_val, self.new_weights.T else: return llr, p_val def mv_mean_contour(self, mu1_low, mu1_upp, mu2_low, mu2_upp, step1, step2, levs=(.001, .01, .05, .1, .2), var1_name=None, var2_name=None, plot_dta=False): """ Creates a confidence region plot for the mean of bivariate data Parameters ---------- m1_low : float Minimum value of the mean for variable 1 m1_upp : float Maximum value of the mean for variable 1 mu2_low : float Minimum value of the mean for variable 2 mu2_upp : float Maximum value of the mean for variable 2 step1 : float Increment of evaluations for variable 1 step2 : float Increment of evaluations for variable 2 levs : list Levels to be drawn on the contour plot. Default = (.001, .01, .05, .1, .2) plot_dta : bool If True, makes a scatter plot of the data on top of the contour plot. Defaultis False. var1_name : str Name of variable 1 to be plotted on the x-axis var2_name : str Name of variable 2 to be plotted on the y-axis Notes ----- The smaller the step size, the more accurate the intervals will be If the function returns optimization failed, consider narrowing the boundaries of the plot Examples -------- >>> import statsmodels.api as sm >>> two_rvs = np.random.standard_normal((20,2)) >>> el_analysis = sm.emplike.DescStat(two_rvs) >>> contourp = el_analysis.mv_mean_contour(-2, 2, -2, 2, .1, .1) >>> contourp.show() """ if self.endog.shape[1] != 2: raise ValueError('Data must contain exactly two variables') fig, ax = utils.create_mpl_ax() if var2_name is None: ax.set_ylabel('Variable 2') else: ax.set_ylabel(var2_name) if var1_name is None: ax.set_xlabel('Variable 1') else: ax.set_xlabel(var1_name) x = np.arange(mu1_low, mu1_upp, step1) y = np.arange(mu2_low, mu2_upp, step2) pairs = itertools.product(x, y) z = [] for i in pairs: z.append(self.mv_test_mean(np.asarray(i))[0]) X, Y = np.meshgrid(x, y) z = np.asarray(z) z = z.reshape(X.shape[1], Y.shape[0]) ax.contour(x, y, z.T, levels=levs) if plot_dta: ax.plot(self.endog[:, 0], self.endog[:, 1], 'bo') return fig def test_corr(self, corr0, return_weights=0): """ Returns -2 x log-likelihood ratio and p-value for the correlation coefficient between 2 variables Parameters ---------- corr0 : float Hypothesized value to be tested return_weights : bool If true, returns the weights that maximize the log-likelihood at the hypothesized value """ nobs = self.nobs endog = self.endog if endog.shape[1] != 2: raise NotImplementedError('Correlation matrix not yet implemented') nuis0 = np.array([endog[:, 0].mean(), endog[:, 0].var(), endog[:, 1].mean(), endog[:, 1].var()]) x0 = np.zeros(5) weights0 = np.array([1. / nobs] * int(nobs)) args = (corr0, endog, nobs, x0, weights0) llr = optimize.fmin(self._opt_correl, nuis0, args=args, full_output=1, disp=0)[1] p_val = chi2.sf(llr, 1) if return_weights: return llr, p_val, self.new_weights.T return llr, p_val def ci_corr(self, sig=.05, upper_bound=None, lower_bound=None): """ Returns the confidence intervals for the correlation coefficient Parameters ---------- sig : float The significance level. Default is .05 upper_bound : float Maximum value the upper confidence limit can be. Default is 99% confidence limit assuming normality. lower_bound : float Minimum value the lower confidence limit can be. Default is 99% confidence limit assuming normality. Returns ------- interval : tuple Confidence interval for the correlation """ endog = self.endog nobs = self.nobs self.r0 = chi2.ppf(1 - sig, 1) point_est = np.corrcoef(endog[:, 0], endog[:, 1])[0, 1] if upper_bound is None: upper_bound = min(.999, point_est + \ 2.5 * ((1. - point_est ** 2.) / \ (nobs - 2.)) ** .5) if lower_bound is None: lower_bound = max(- .999, point_est - \ 2.5 * (np.sqrt((1. - point_est ** 2.) / \ (nobs - 2.)))) llim = optimize.brenth(self._ci_limits_corr, lower_bound, point_est) ulim = optimize.brenth(self._ci_limits_corr, point_est, upper_bound) return llim, ulim