"""numerical differentiation function, gradient, Jacobian, and Hessian Author : josef-pkt License : BSD Notes ----- These are simple forward differentiation, so that we have them available without dependencies. * Jacobian should be faster than numdifftools because it does not use loop over observations. * numerical precision will vary and depend on the choice of stepsizes """ # TODO: # * some cleanup # * check numerical accuracy (and bugs) with numdifftools and analytical # derivatives # - linear least squares case: (hess - 2*X'X) is 1e-8 or so # - gradient and Hessian agree with numdifftools when evaluated away from # minimum # - forward gradient, Jacobian evaluated at minimum is inaccurate, centered # (+/- epsilon) is ok # * dot product of Jacobian is different from Hessian, either wrong example or # a bug (unlikely), or a real difference # # # What are the conditions that Jacobian dotproduct and Hessian are the same? # # See also: # # BHHH: Greene p481 17.4.6, MLE Jacobian = d loglike / d beta , where loglike # is vector for each observation # see also example 17.4 when J'J is very different from Hessian # also does it hold only at the minimum, what's relationship to covariance # of Jacobian matrix # http://projects.scipy.org/scipy/ticket/1157 # https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm # objective: sum((y-f(beta,x)**2), Jacobian = d f/d beta # and not d objective/d beta as in MLE Greene # similar: http://crsouza.blogspot.com/2009/11/neural-network-learning-by-levenberg_18.html#hessian # # in example: if J = d x*beta / d beta then J'J == X'X # similar to https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm import numpy as np from statsmodels.compat.pandas import Appender, Substitution # NOTE: we only do double precision internally so far EPS = np.finfo(float).eps _hessian_docs = """ Calculate Hessian with finite difference derivative approximation Parameters ---------- x : array_like value at which function derivative is evaluated f : function function of one array f(x, `*args`, `**kwargs`) epsilon : float or array_like, optional Stepsize used, if None, then stepsize is automatically chosen according to EPS**(1/%(scale)s)*x. args : tuple Arguments for function `f`. kwargs : dict Keyword arguments for function `f`. %(extra_params)s Returns ------- hess : ndarray array of partial second derivatives, Hessian %(extra_returns)s Notes ----- Equation (%(equation_number)s) in Ridout. Computes the Hessian as:: %(equation)s where e[j] is a vector with element j == 1 and the rest are zero and d[i] is epsilon[i]. References ----------: Ridout, M.S. (2009) Statistical applications of the complex-step method of numerical differentiation. The American Statistician, 63, 66-74 """ def _get_epsilon(x, s, epsilon, n): if epsilon is None: h = EPS**(1. / s) * np.maximum(np.abs(x), 0.1) else: if np.isscalar(epsilon): h = np.empty(n) h.fill(epsilon) else: # pragma : no cover h = np.asarray(epsilon) if h.shape != x.shape: raise ValueError("If h is not a scalar it must have the same" " shape as x.") return h def approx_fprime(x, f, epsilon=None, args=(), kwargs={}, centered=False): ''' Gradient of function, or Jacobian if function f returns 1d array Parameters ---------- x : ndarray parameters at which the derivative is evaluated f : function `f(*((x,)+args), **kwargs)` returning either one value or 1d array epsilon : float, optional Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for `centered` == False and EPS**(1/3)*x for `centered` == True. args : tuple Tuple of additional arguments for function `f`. kwargs : dict Dictionary of additional keyword arguments for function `f`. centered : bool Whether central difference should be returned. If not, does forward differencing. Returns ------- grad : ndarray gradient or Jacobian Notes ----- If f returns a 1d array, it returns a Jacobian. If a 2d array is returned by f (e.g., with a value for each observation), it returns a 3d array with the Jacobian of each observation with shape xk x nobs x xk. I.e., the Jacobian of the first observation would be [:, 0, :] ''' n = len(x) f0 = f(*((x,)+args), **kwargs) dim = np.atleast_1d(f0).shape # it could be a scalar grad = np.zeros((n,) + dim, np.promote_types(float, x.dtype)) ei = np.zeros((n,), float) if not centered: epsilon = _get_epsilon(x, 2, epsilon, n) for k in range(n): ei[k] = epsilon[k] grad[k, :] = (f(*((x+ei,) + args), **kwargs) - f0)/epsilon[k] ei[k] = 0.0 else: epsilon = _get_epsilon(x, 3, epsilon, n) / 2. for k in range(n): ei[k] = epsilon[k] grad[k, :] = (f(*((x+ei,)+args), **kwargs) - f(*((x-ei,)+args), **kwargs))/(2 * epsilon[k]) ei[k] = 0.0 return grad.squeeze().T def _approx_fprime_scalar(x, f, epsilon=None, args=(), kwargs={}, centered=False): ''' Gradient of function vectorized for scalar parameter. This assumes that the function ``f`` is vectorized for a scalar parameter. The function value ``f(x)`` has then the same shape as the input ``x``. The derivative returned by this function also has the same shape as ``x``. Parameters ---------- x : ndarray Parameters at which the derivative is evaluated. f : function `f(*((x,)+args), **kwargs)` returning either one value or 1d array epsilon : float, optional Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for `centered` == False and EPS**(1/3)*x for `centered` == True. args : tuple Tuple of additional arguments for function `f`. kwargs : dict Dictionary of additional keyword arguments for function `f`. centered : bool Whether central difference should be returned. If not, does forward differencing. Returns ------- grad : ndarray Array of derivatives, gradient evaluated at parameters ``x``. ''' x = np.asarray(x) n = 1 f0 = f(*((x,)+args), **kwargs) if not centered: eps = _get_epsilon(x, 2, epsilon, n) grad = (f(*((x+eps,) + args), **kwargs) - f0) / eps else: eps = _get_epsilon(x, 3, epsilon, n) / 2. grad = (f(*((x+eps,)+args), **kwargs) - f(*((x-eps,)+args), **kwargs)) / (2 * eps) return grad def approx_fprime_cs(x, f, epsilon=None, args=(), kwargs={}): ''' Calculate gradient or Jacobian with complex step derivative approximation Parameters ---------- x : ndarray parameters at which the derivative is evaluated f : function `f(*((x,)+args), **kwargs)` returning either one value or 1d array epsilon : float, optional Stepsize, if None, optimal stepsize is used. Optimal step-size is EPS*x. See note. args : tuple Tuple of additional arguments for function `f`. kwargs : dict Dictionary of additional keyword arguments for function `f`. Returns ------- partials : ndarray array of partial derivatives, Gradient or Jacobian Notes ----- The complex-step derivative has truncation error O(epsilon**2), so truncation error can be eliminated by choosing epsilon to be very small. The complex-step derivative avoids the problem of round-off error with small epsilon because there is no subtraction. ''' # From Guilherme P. de Freitas, numpy mailing list # May 04 2010 thread "Improvement of performance" # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html n = len(x) epsilon = _get_epsilon(x, 1, epsilon, n) increments = np.identity(n) * 1j * epsilon # TODO: see if this can be vectorized, but usually dim is small partials = [f(x+ih, *args, **kwargs).imag / epsilon[i] for i, ih in enumerate(increments)] return np.array(partials).T def _approx_fprime_cs_scalar(x, f, epsilon=None, args=(), kwargs={}): ''' Calculate gradient for scalar parameter with complex step derivatives. This assumes that the function ``f`` is vectorized for a scalar parameter. The function value ``f(x)`` has then the same shape as the input ``x``. The derivative returned by this function also has the same shape as ``x``. Parameters ---------- x : ndarray Parameters at which the derivative is evaluated. f : function `f(*((x,)+args), **kwargs)` returning either one value or 1d array. epsilon : float, optional Stepsize, if None, optimal stepsize is used. Optimal step-size is EPS*x. See note. args : tuple Tuple of additional arguments for function `f`. kwargs : dict Dictionary of additional keyword arguments for function `f`. Returns ------- partials : ndarray Array of derivatives, gradient evaluated for parameters ``x``. Notes ----- The complex-step derivative has truncation error O(epsilon**2), so truncation error can be eliminated by choosing epsilon to be very small. The complex-step derivative avoids the problem of round-off error with small epsilon because there is no subtraction. ''' # From Guilherme P. de Freitas, numpy mailing list # May 04 2010 thread "Improvement of performance" # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html x = np.asarray(x) n = x.shape[-1] epsilon = _get_epsilon(x, 1, epsilon, n) eps = 1j * epsilon partials = f(x + eps, *args, **kwargs).imag / epsilon return np.array(partials) def approx_hess_cs(x, f, epsilon=None, args=(), kwargs={}): '''Calculate Hessian with complex-step derivative approximation Parameters ---------- x : array_like value at which function derivative is evaluated f : function function of one array f(x) epsilon : float stepsize, if None, then stepsize is automatically chosen Returns ------- hess : ndarray array of partial second derivatives, Hessian Notes ----- based on equation 10 in M. S. RIDOUT: Statistical Applications of the Complex-step Method of Numerical Differentiation, University of Kent, Canterbury, Kent, U.K. The stepsize is the same for the complex and the finite difference part. ''' # TODO: might want to consider lowering the step for pure derivatives n = len(x) h = _get_epsilon(x, 3, epsilon, n) ee = np.diag(h) hess = np.outer(h, h) n = len(x) for i in range(n): for j in range(i, n): hess[i, j] = (f(*((x + 1j*ee[i, :] + ee[j, :],) + args), **kwargs) - f(*((x + 1j*ee[i, :] - ee[j, :],)+args), **kwargs)).imag/2./hess[i, j] hess[j, i] = hess[i, j] return hess @Substitution( scale="3", extra_params="""return_grad : bool Whether or not to also return the gradient """, extra_returns="""grad : nparray Gradient if return_grad == True """, equation_number="7", equation="""1/(d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j]))) """ ) @Appender(_hessian_docs) def approx_hess1(x, f, epsilon=None, args=(), kwargs={}, return_grad=False): n = len(x) h = _get_epsilon(x, 3, epsilon, n) ee = np.diag(h) f0 = f(*((x,)+args), **kwargs) # Compute forward step g = np.zeros(n) for i in range(n): g[i] = f(*((x+ee[i, :],)+args), **kwargs) hess = np.outer(h, h) # this is now epsilon**2 # Compute "double" forward step for i in range(n): for j in range(i, n): hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) - g[i] - g[j] + f0)/hess[i, j] hess[j, i] = hess[i, j] if return_grad: grad = (g - f0)/h return hess, grad else: return hess @Substitution( scale="3", extra_params="""return_grad : bool Whether or not to also return the gradient """, extra_returns="""grad : ndarray Gradient if return_grad == True """, equation_number="8", equation="""1/(2*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j])) - (f(x + d[k]*e[k]) - f(x)) + (f(x - d[j]*e[j] - d[k]*e[k]) - f(x + d[j]*e[j])) - (f(x - d[k]*e[k]) - f(x))) """ ) @Appender(_hessian_docs) def approx_hess2(x, f, epsilon=None, args=(), kwargs={}, return_grad=False): # n = len(x) # NOTE: ridout suggesting using eps**(1/4)*theta h = _get_epsilon(x, 3, epsilon, n) ee = np.diag(h) f0 = f(*((x,)+args), **kwargs) # Compute forward step g = np.zeros(n) gg = np.zeros(n) for i in range(n): g[i] = f(*((x+ee[i, :],)+args), **kwargs) gg[i] = f(*((x-ee[i, :],)+args), **kwargs) hess = np.outer(h, h) # this is now epsilon**2 # Compute "double" forward step for i in range(n): for j in range(i, n): hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) - g[i] - g[j] + f0 + f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs) - gg[i] - gg[j] + f0)/(2 * hess[i, j]) hess[j, i] = hess[i, j] if return_grad: grad = (g - f0)/h return hess, grad else: return hess @Substitution( scale="4", extra_params="", extra_returns="", equation_number="9", equation="""1/(4*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j] - d[k]*e[k])) - (f(x - d[j]*e[j] + d[k]*e[k]) - f(x - d[j]*e[j] - d[k]*e[k]))""" ) @Appender(_hessian_docs) def approx_hess3(x, f, epsilon=None, args=(), kwargs={}): n = len(x) h = _get_epsilon(x, 4, epsilon, n) ee = np.diag(h) hess = np.outer(h, h) for i in range(n): for j in range(i, n): hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs) - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs) - f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs)) )/(4.*hess[i, j]) hess[j, i] = hess[i, j] return hess approx_hess = approx_hess3 approx_hess.__doc__ += "\n This is an alias for approx_hess3"