'''gradient/Jacobian of normal and t loglikelihood use chain rule normal derivative wrt mu, sigma and beta new version: loc-scale distributions, derivative wrt loc, scale also includes "standardized" t distribution (for use in GARCH) TODO: * use sympy for derivative of loglike wrt shape parameters it works for df of t distribution dlog(gamma(a))da = polygamma(0,a) check polygamma is available in scipy.special * get loc-scale example to work with mean = X*b * write some full unit test examples A: josef-pktd ''' import numpy as np from scipy import special from scipy.special import gammaln def norm_lls(y, params): '''normal loglikelihood given observations and mean mu and variance sigma2 Parameters ---------- y : ndarray, 1d normally distributed random variable params : ndarray, (nobs, 2) array of mean, variance (mu, sigma2) with observations in rows Returns ------- lls : ndarray contribution to loglikelihood for each observation ''' mu, sigma2 = params.T lls = -0.5*(np.log(2*np.pi) + np.log(sigma2) + (y-mu)**2/sigma2) return lls def norm_lls_grad(y, params): '''Jacobian of normal loglikelihood wrt mean mu and variance sigma2 Parameters ---------- y : ndarray, 1d normally distributed random variable params : ndarray, (nobs, 2) array of mean, variance (mu, sigma2) with observations in rows Returns ------- grad : array (nobs, 2) derivative of loglikelihood for each observation wrt mean in first column, and wrt variance in second column Notes ----- this is actually the derivative wrt sigma not sigma**2, but evaluated with parameter sigma2 = sigma**2 ''' mu, sigma2 = params.T dllsdmu = (y-mu)/sigma2 dllsdsigma2 = ((y-mu)**2/sigma2 - 1)/np.sqrt(sigma2) return np.column_stack((dllsdmu, dllsdsigma2)) def mean_grad(x, beta): '''gradient/Jacobian for d (x*beta)/ d beta ''' return x def normgrad(y, x, params): '''Jacobian of normal loglikelihood wrt mean mu and variance sigma2 Parameters ---------- y : ndarray, 1d normally distributed random variable with mean x*beta, and variance sigma2 x : ndarray, 2d explanatory variables, observation in rows, variables in columns params : array_like, (nvars + 1) array of coefficients and variance (beta, sigma2) Returns ------- grad : array (nobs, 2) derivative of loglikelihood for each observation wrt mean in first column, and wrt scale (sigma) in second column assume params = (beta, sigma2) Notes ----- TODO: for heteroscedasticity need sigma to be a 1d array ''' beta = params[:-1] sigma2 = params[-1]*np.ones((len(y),1)) dmudbeta = mean_grad(x, beta) mu = np.dot(x, beta) #print(beta, sigma2) params2 = np.column_stack((mu,sigma2)) dllsdms = norm_lls_grad(y,params2) grad = np.column_stack((dllsdms[:,:1]*dmudbeta, dllsdms[:,:1])) return grad def tstd_lls(y, params, df): '''t loglikelihood given observations and mean mu and variance sigma2 = 1 Parameters ---------- y : ndarray, 1d normally distributed random variable params : ndarray, (nobs, 2) array of mean, variance (mu, sigma2) with observations in rows df : int degrees of freedom of the t distribution Returns ------- lls : ndarray contribution to loglikelihood for each observation Notes ----- parametrized for garch ''' mu, sigma2 = params.T df = df*1.0 #lls = gammaln((df+1)/2.) - gammaln(df/2.) - 0.5*np.log((df-2)*np.pi) #lls -= (df+1)/2. * np.log(1. + (y-mu)**2/(df-2.)/sigma2) + 0.5 * np.log(sigma2) lls = gammaln((df+1)/2.) - gammaln(df/2.) - 0.5*np.log((df-2)*np.pi) lls -= (df+1)/2. * np.log(1. + (y-mu)**2/(df-2)/sigma2) + 0.5 * np.log(sigma2) return lls def norm_dlldy(y): '''derivative of log pdf of standard normal with respect to y ''' return -y def tstd_pdf(x, df): '''pdf for standardized (not standard) t distribution, variance is one ''' r = np.array(df*1.0) Px = np.exp(special.gammaln((r+1)/2.)-special.gammaln(r/2.))/np.sqrt((r-2)*np.pi) Px /= (1+(x**2)/(r-2))**((r+1)/2.) return Px def ts_lls(y, params, df): '''t loglikelihood given observations and mean mu and variance sigma2 = 1 Parameters ---------- y : ndarray, 1d normally distributed random variable params : ndarray, (nobs, 2) array of mean, variance (mu, sigma2) with observations in rows df : int degrees of freedom of the t distribution Returns ------- lls : ndarray contribution to loglikelihood for each observation Notes ----- parametrized for garch normalized/rescaled so that sigma2 is the variance >>> df = 10; sigma = 1. >>> stats.t.stats(df, loc=0., scale=sigma.*np.sqrt((df-2.)/df)) (array(0.0), array(1.0)) >>> sigma = np.sqrt(2.) >>> stats.t.stats(df, loc=0., scale=sigma*np.sqrt((df-2.)/df)) (array(0.0), array(2.0)) ''' print(y, params, df) mu, sigma2 = params.T df = df*1.0 #lls = gammaln((df+1)/2.) - gammaln(df/2.) - 0.5*np.log((df-2)*np.pi) #lls -= (df+1)/2. * np.log(1. + (y-mu)**2/(df-2.)/sigma2) + 0.5 * np.log(sigma2) lls = gammaln((df+1)/2.) - gammaln(df/2.) - 0.5*np.log((df)*np.pi) lls -= (df+1.)/2. * np.log(1. + (y-mu)**2/(df)/sigma2) + 0.5 * np.log(sigma2) return lls def ts_dlldy(y, df): '''derivative of log pdf of standard t with respect to y Parameters ---------- y : array_like data points of random variable at which loglike is evaluated df : array_like degrees of freedom,shape parameters of log-likelihood function of t distribution Returns ------- dlldy : ndarray derivative of loglikelihood wrt random variable y evaluated at the points given in y Notes ----- with mean 0 and scale 1, but variance is df/(df-2) ''' df = df*1. #(df+1)/2. / (1 + y**2/(df-2.)) * 2.*y/(df-2.) #return -(df+1)/(df-2.) / (1 + y**2/(df-2.)) * y return -(df+1)/(df) / (1 + y**2/(df)) * y def tstd_dlldy(y, df): '''derivative of log pdf of standardized t with respect to y Parameters ---------- y : array_like data points of random variable at which loglike is evaluated df : array_like degrees of freedom,shape parameters of log-likelihood function of t distribution Returns ------- dlldy : ndarray derivative of loglikelihood wrt random variable y evaluated at the points given in y Notes ----- parametrized for garch, standardized to variance=1 ''' #(df+1)/2. / (1 + y**2/(df-2.)) * 2.*y/(df-2.) return -(df+1)/(df-2.) / (1 + y**2/(df-2.)) * y #return (df+1)/(df) / (1 + y**2/(df)) * y def locscale_grad(y, loc, scale, dlldy, *args): '''derivative of log-likelihood with respect to location and scale Parameters ---------- y : array_like data points of random variable at which loglike is evaluated loc : float location parameter of distribution scale : float scale parameter of distribution dlldy : function derivative of loglikelihood fuction wrt. random variable x args : array_like shape parameters of log-likelihood function Returns ------- dlldloc : ndarray derivative of loglikelihood wrt location evaluated at the points given in y dlldscale : ndarray derivative of loglikelihood wrt scale evaluated at the points given in y ''' yst = (y-loc)/scale #ystandardized dlldloc = -dlldy(yst, *args) / scale dlldscale = -1./scale - dlldy(yst, *args) * (y-loc)/scale**2 return dlldloc, dlldscale if __name__ == '__main__': verbose = 0 if verbose: sig = 0.1 beta = np.ones(2) rvs = np.random.randn(10,3) x = rvs[:,1:] y = np.dot(x,beta) + sig*rvs[:,0] params = [1,1,1] print(normgrad(y, x, params)) dllfdbeta = (y-np.dot(x, beta))[:,None]*x #for sigma = 1 print(dllfdbeta) print(locscale_grad(y, np.dot(x, beta), 1, norm_dlldy)) print(y-np.dot(x, beta)) from scipy import stats, misc def llt(y,loc,scale,df): return np.log(stats.t.pdf(y, df, loc=loc, scale=scale)) def lltloc(loc,y,scale,df): return np.log(stats.t.pdf(y, df, loc=loc, scale=scale)) def lltscale(scale,y,loc,df): return np.log(stats.t.pdf(y, df, loc=loc, scale=scale)) def llnorm(y,loc,scale): return np.log(stats.norm.pdf(y, loc=loc, scale=scale)) def llnormloc(loc,y,scale): return np.log(stats.norm.pdf(y, loc=loc, scale=scale)) def llnormscale(scale,y,loc): return np.log(stats.norm.pdf(y, loc=loc, scale=scale)) if verbose: print('\ngradient of t') print(misc.derivative(llt, 1, dx=1e-6, n=1, args=(0,1,10), order=3)) print('t ', locscale_grad(1, 0, 1, tstd_dlldy, 10)) print('ts', locscale_grad(1, 0, 1, ts_dlldy, 10)) print(misc.derivative(llt, 1.5, dx=1e-10, n=1, args=(0,1,20), order=3),) print('ts', locscale_grad(1.5, 0, 1, ts_dlldy, 20)) print(misc.derivative(llt, 1.5, dx=1e-10, n=1, args=(0,2,20), order=3),) print('ts', locscale_grad(1.5, 0, 2, ts_dlldy, 20)) print(misc.derivative(llt, 1.5, dx=1e-10, n=1, args=(1,2,20), order=3),) print('ts', locscale_grad(1.5, 1, 2, ts_dlldy, 20)) print(misc.derivative(lltloc, 1, dx=1e-10, n=1, args=(1.5,2,20), order=3),) print(misc.derivative(lltscale, 2, dx=1e-10, n=1, args=(1.5,1,20), order=3)) y,loc,scale,df = 1.5, 1, 2, 20 print('ts', locscale_grad(y,loc,scale, ts_dlldy, 20)) print(misc.derivative(lltloc, loc, dx=1e-10, n=1, args=(y,scale,df), order=3),) print(misc.derivative(lltscale, scale, dx=1e-10, n=1, args=(y,loc,df), order=3)) print('\ngradient of norm') print(misc.derivative(llnorm, 1, dx=1e-6, n=1, args=(0,1), order=3)) print(locscale_grad(1, 0, 1, norm_dlldy)) y,loc,scale = 1.5, 1, 2 print('ts', locscale_grad(y,loc,scale, norm_dlldy)) print(misc.derivative(llnormloc, loc, dx=1e-10, n=1, args=(y,scale), order=3),) print(misc.derivative(llnormscale, scale, dx=1e-10, n=1, args=(y,loc), order=3)) y,loc,scale = 1.5, 0, 1 print('ts', locscale_grad(y,loc,scale, norm_dlldy)) print(misc.derivative(llnormloc, loc, dx=1e-10, n=1, args=(y,scale), order=3),) print(misc.derivative(llnormscale, scale, dx=1e-10, n=1, args=(y,loc), order=3)) #print('still something wrong with handling of scale and variance' #looks ok now print('\nloglike of t') print(tstd_lls(1, np.array([0,1]), 100), llt(1,0,1,100), 'differently standardized') print(tstd_lls(1, np.array([0,1]), 10), llt(1,0,1,10), 'differently standardized') print(ts_lls(1, np.array([0,1]), 10), llt(1,0,1,10)) print(tstd_lls(1, np.array([0,1.*10./8.]), 10), llt(1.,0,1.,10)) print(ts_lls(1, np.array([0,1]), 100), llt(1,0,1,100)) print(tstd_lls(1, np.array([0,1]), 10), llt(1,0,1.*np.sqrt(8/10.),10)) from numpy.testing import assert_almost_equal params =[(0, 1), (1.,1.), (0.,2.), ( 1., 2.)] yt = np.linspace(-2.,2.,11) for loc,scale in params: dlldlo = misc.derivative(llnormloc, loc, dx=1e-10, n=1, args=(yt,scale), order=3) dlldsc = misc.derivative(llnormscale, scale, dx=1e-10, n=1, args=(yt,loc), order=3) gr = locscale_grad(yt, loc, scale, norm_dlldy) assert_almost_equal(dlldlo, gr[0], 5, err_msg='deriv loc') assert_almost_equal(dlldsc, gr[1], 5, err_msg='deriv scale') for df in [3, 10, 100]: for loc,scale in params: dlldlo = misc.derivative(lltloc, loc, dx=1e-10, n=1, args=(yt,scale,df), order=3) dlldsc = misc.derivative(lltscale, scale, dx=1e-10, n=1, args=(yt,loc,df), order=3) gr = locscale_grad(yt, loc, scale, ts_dlldy, df) assert_almost_equal(dlldlo, gr[0], 4, err_msg='deriv loc') assert_almost_equal(dlldsc, gr[1], 4, err_msg='deriv scale') assert_almost_equal(ts_lls(yt, np.array([loc, scale**2]), df), llt(yt,loc,scale,df), 5, err_msg='loglike') assert_almost_equal(tstd_lls(yt, np.array([loc, scale**2]), df), llt(yt,loc,scale*np.sqrt((df-2.)/df),df), 5, err_msg='loglike')