""" Newton solver for Generalized Linear Models """ # Author: Christian Lorentzen # License: BSD 3 clause import warnings from abc import ABC, abstractmethod import numpy as np import scipy.linalg import scipy.optimize from ..._loss.loss import HalfSquaredError from ...exceptions import ConvergenceWarning from ...utils.optimize import _check_optimize_result from .._linear_loss import LinearModelLoss class NewtonSolver(ABC): """Newton solver for GLMs. This class implements Newton/2nd-order optimization routines for GLMs. Each Newton iteration aims at finding the Newton step which is done by the inner solver. With Hessian H, gradient g and coefficients coef, one step solves: H @ coef_newton = -g For our GLM / LinearModelLoss, we have gradient g and Hessian H: g = X.T @ loss.gradient + l2_reg_strength * coef H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity Backtracking line search updates coef = coef_old + t * coef_newton for some t in (0, 1]. This is a base class, actual implementations (child classes) may deviate from the above pattern and use structure specific tricks. Usage pattern: - initialize solver: sol = NewtonSolver(...) - solve the problem: sol.solve(X, y, sample_weight) References ---------- - Jorge Nocedal, Stephen J. Wright. (2006) "Numerical Optimization" 2nd edition https://doi.org/10.1007/978-0-387-40065-5 - Stephen P. Boyd, Lieven Vandenberghe. (2004) "Convex Optimization." Cambridge University Press, 2004. https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf Parameters ---------- coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,) Initial coefficients of a linear model. If shape (n_classes * n_dof,), the classes of one feature are contiguous, i.e. one reconstructs the 2d-array via coef.reshape((n_classes, -1), order="F"). linear_loss : LinearModelLoss The loss to be minimized. l2_reg_strength : float, default=0.0 L2 regularization strength. tol : float, default=1e-4 The optimization problem is solved when each of the following condition is fulfilled: 1. maximum |gradient| <= tol 2. Newton decrement d: 1/2 * d^2 <= tol max_iter : int, default=100 Maximum number of Newton steps allowed. n_threads : int, default=1 Number of OpenMP threads to use for the computation of the Hessian and gradient of the loss function. Attributes ---------- coef_old : ndarray of shape coef.shape Coefficient of previous iteration. coef_newton : ndarray of shape coef.shape Newton step. gradient : ndarray of shape coef.shape Gradient of the loss w.r.t. the coefficients. gradient_old : ndarray of shape coef.shape Gradient of previous iteration. loss_value : float Value of objective function = loss + penalty. loss_value_old : float Value of objective function of previous itertion. raw_prediction : ndarray of shape (n_samples,) or (n_samples, n_classes) converged : bool Indicator for convergence of the solver. iteration : int Number of Newton steps, i.e. calls to inner_solve use_fallback_lbfgs_solve : bool If set to True, the solver will resort to call LBFGS to finish the optimisation procedure in case of convergence issues. gradient_times_newton : float gradient @ coef_newton, set in inner_solve and used by line_search. If the Newton step is a descent direction, this is negative. 
""" def __init__( self, *, coef, linear_loss=LinearModelLoss(base_loss=HalfSquaredError(), fit_intercept=True), l2_reg_strength=0.0, tol=1e-4, max_iter=100, n_threads=1, verbose=0, ): self.coef = coef self.linear_loss = linear_loss self.l2_reg_strength = l2_reg_strength self.tol = tol self.max_iter = max_iter self.n_threads = n_threads self.verbose = verbose def setup(self, X, y, sample_weight): """Precomputations If None, initializes: - self.coef Sets: - self.raw_prediction - self.loss_value """ _, _, self.raw_prediction = self.linear_loss.weight_intercept_raw(self.coef, X) self.loss_value = self.linear_loss.loss( coef=self.coef, X=X, y=y, sample_weight=sample_weight, l2_reg_strength=self.l2_reg_strength, n_threads=self.n_threads, raw_prediction=self.raw_prediction, ) @abstractmethod def update_gradient_hessian(self, X, y, sample_weight): """Update gradient and Hessian.""" @abstractmethod def inner_solve(self, X, y, sample_weight): """Compute Newton step. Sets: - self.coef_newton - self.gradient_times_newton """ def fallback_lbfgs_solve(self, X, y, sample_weight): """Fallback solver in case of emergency. If a solver detects convergence problems, it may fall back to this methods in the hope to exit with success instead of raising an error. Sets: - self.coef - self.converged """ opt_res = scipy.optimize.minimize( self.linear_loss.loss_gradient, self.coef, method="L-BFGS-B", jac=True, options={ "maxiter": self.max_iter, "maxls": 50, # default is 20 "iprint": self.verbose - 1, "gtol": self.tol, "ftol": 64 * np.finfo(np.float64).eps, }, args=(X, y, sample_weight, self.l2_reg_strength, self.n_threads), ) self.n_iter_ = _check_optimize_result("lbfgs", opt_res) self.coef = opt_res.x self.converged = opt_res.status == 0 def line_search(self, X, y, sample_weight): """Backtracking line search. Sets: - self.coef_old - self.coef - self.loss_value_old - self.loss_value - self.gradient_old - self.gradient - self.raw_prediction """ # line search parameters beta, sigma = 0.5, 0.00048828125 # 1/2, 1/2**11 eps = 16 * np.finfo(self.loss_value.dtype).eps t = 1 # step size # gradient_times_newton = self.gradient @ self.coef_newton # was computed in inner_solve. armijo_term = sigma * self.gradient_times_newton _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw( self.coef_newton, X ) self.coef_old = self.coef self.loss_value_old = self.loss_value self.gradient_old = self.gradient # np.sum(np.abs(self.gradient_old)) sum_abs_grad_old = -1 is_verbose = self.verbose >= 2 if is_verbose: print(" Backtracking Line Search") print(f" eps=10 * finfo.eps={eps}") for i in range(21): # until and including t = beta**20 ~ 1e-6 self.coef = self.coef_old + t * self.coef_newton raw = self.raw_prediction + t * raw_prediction_newton self.loss_value, self.gradient = self.linear_loss.loss_gradient( coef=self.coef, X=X, y=y, sample_weight=sample_weight, l2_reg_strength=self.l2_reg_strength, n_threads=self.n_threads, raw_prediction=raw, ) # Note: If coef_newton is too large, loss_gradient may produce inf values, # potentially accompanied by a RuntimeWarning. # This case will be captured by the Armijo condition. # 1. Check Armijo / sufficient decrease condition. # The smaller (more negative) the better. loss_improvement = self.loss_value - self.loss_value_old check = loss_improvement <= t * armijo_term if is_verbose: print( f" line search iteration={i+1}, step size={t}\n" f" check loss improvement <= armijo term: {loss_improvement} " f"<= {t * armijo_term} {check}" ) if check: break # 2. 
    def line_search(self, X, y, sample_weight):
        """Backtracking line search.

        Sets:
            - self.coef_old
            - self.coef
            - self.loss_value_old
            - self.loss_value
            - self.gradient_old
            - self.gradient
            - self.raw_prediction
        """
        # line search parameters
        beta, sigma = 0.5, 0.00048828125  # 1/2, 1/2**11
        eps = 16 * np.finfo(self.loss_value.dtype).eps
        t = 1  # step size

        # gradient_times_newton = self.gradient @ self.coef_newton
        # was computed in inner_solve.
        armijo_term = sigma * self.gradient_times_newton
        _, _, raw_prediction_newton = self.linear_loss.weight_intercept_raw(
            self.coef_newton, X
        )

        self.coef_old = self.coef
        self.loss_value_old = self.loss_value
        self.gradient_old = self.gradient

        # np.sum(np.abs(self.gradient_old))
        sum_abs_grad_old = -1

        is_verbose = self.verbose >= 2
        if is_verbose:
            print("  Backtracking Line Search")
            print(f"    eps=16 * finfo.eps={eps}")

        for i in range(21):  # until and including t = beta**20 ~ 1e-6
            self.coef = self.coef_old + t * self.coef_newton
            raw = self.raw_prediction + t * raw_prediction_newton
            self.loss_value, self.gradient = self.linear_loss.loss_gradient(
                coef=self.coef,
                X=X,
                y=y,
                sample_weight=sample_weight,
                l2_reg_strength=self.l2_reg_strength,
                n_threads=self.n_threads,
                raw_prediction=raw,
            )
            # Note: If coef_newton is too large, loss_gradient may produce inf values,
            # potentially accompanied by a RuntimeWarning.
            # This case will be captured by the Armijo condition.

            # 1. Check Armijo / sufficient decrease condition.
            #    The smaller (more negative) the better.
            loss_improvement = self.loss_value - self.loss_value_old
            check = loss_improvement <= t * armijo_term
            if is_verbose:
                print(
                    f"    line search iteration={i + 1}, step size={t}\n"
                    f"      check loss improvement <= armijo term: {loss_improvement} "
                    f"<= {t * armijo_term} {check}"
                )
            if check:
                break
            # 2. Deal with relative loss differences around machine precision.
            tiny_loss = np.abs(self.loss_value_old * eps)
            check = np.abs(loss_improvement) <= tiny_loss
            if is_verbose:
                print(
                    "      check loss |improvement| <= eps * |loss_old|:"
                    f" {np.abs(loss_improvement)} <= {tiny_loss} {check}"
                )
            if check:
                if sum_abs_grad_old < 0:
                    sum_abs_grad_old = scipy.linalg.norm(self.gradient_old, ord=1)
                # 2.1 Check sum of absolute gradients as alternative condition.
                sum_abs_grad = scipy.linalg.norm(self.gradient, ord=1)
                check = sum_abs_grad < sum_abs_grad_old
                if is_verbose:
                    print(
                        "      check sum(|gradient|) < sum(|gradient_old|): "
                        f"{sum_abs_grad} < {sum_abs_grad_old} {check}"
                    )
                if check:
                    break

            t *= beta
        else:
            warnings.warn(
                (
                    f"Line search of Newton solver {self.__class__.__name__} at"
                    f" iteration #{self.iteration} did not converge after 21 line"
                    " search refinement iterations. It will now resort to lbfgs"
                    " instead."
                ),
                ConvergenceWarning,
            )
            if self.verbose:
                print("  Line search did not converge and resorts to lbfgs instead.")
            self.use_fallback_lbfgs_solve = True
            return

        self.raw_prediction = raw

    def check_convergence(self, X, y, sample_weight):
        """Check for convergence.

        Sets self.converged.
        """
        if self.verbose:
            print("  Check Convergence")
        # Note: Checking maximum relative change of coefficient <= tol is a bad
        # convergence criterion because even a large step could have brought us close
        # to the true minimum.
        # coef_step = self.coef - self.coef_old
        # check = np.max(np.abs(coef_step) / np.maximum(1, np.abs(self.coef_old)))

        # 1. Criterion: maximum |gradient| <= tol
        #    The gradient was already updated in line_search()
        check = np.max(np.abs(self.gradient))
        if self.verbose:
            print(f"    1. max |gradient| {check} <= {self.tol}")
        if check > self.tol:
            return

        # 2. Criterion: For Newton decrement d, check 1/2 * d^2 <= tol
        #       d = sqrt(grad @ hessian^-1 @ grad)
        #         = sqrt(coef_newton @ hessian @ coef_newton)
        #    where the second equality uses H @ coef_newton = -grad.
        #    See Boyd, Vandenberghe (2004) "Convex Optimization" Chapter 9.5.1.
        d2 = self.coef_newton @ self.hessian @ self.coef_newton
        if self.verbose:
            print(f"    2. Newton decrement {0.5 * d2} <= {self.tol}")
        if 0.5 * d2 > self.tol:
            return

        if self.verbose:
            loss_value = self.linear_loss.loss(
                coef=self.coef,
                X=X,
                y=y,
                sample_weight=sample_weight,
                l2_reg_strength=self.l2_reg_strength,
                n_threads=self.n_threads,
            )
            print(f"  Solver did converge at loss = {loss_value}.")
        self.converged = True

    def finalize(self, X, y, sample_weight):
        """Finalize the solver's results.

        Some solvers may need this, others not.
        """
        pass

    def solve(self, X, y, sample_weight):
        """Solve the optimization problem.

        This is the main routine.

        Order of calls:
            self.setup()
            while iteration:
                self.update_gradient_hessian()
                self.inner_solve()
                self.line_search()
                self.check_convergence()
            self.finalize()

        Returns
        -------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Solution of the optimization problem.
        """
        # setup usually:
        #   - initializes self.coef if needed
        #   - initializes and calculates self.raw_prediction, self.loss_value
        self.setup(X=X, y=y, sample_weight=sample_weight)

        self.iteration = 1
        self.converged = False
        self.use_fallback_lbfgs_solve = False

        while self.iteration <= self.max_iter and not self.converged:
            if self.verbose:
                print(f"Newton iter={self.iteration}")

            self.use_fallback_lbfgs_solve = False  # Fallback solver.

            # 1. Update Hessian and gradient
            self.update_gradient_hessian(X=X, y=y, sample_weight=sample_weight)

            # TODO:
            # if iteration == 1:
            # We might stop early, e.g. we already are close to the optimum,
            # usually detected by zero gradients at this stage.

            # 2. Inner solver
            #    Calculate Newton step/direction
            #    This usually sets self.coef_newton and self.gradient_times_newton.
            self.inner_solve(X=X, y=y, sample_weight=sample_weight)
            if self.use_fallback_lbfgs_solve:
                break

            # 3. Backtracking line search
            #    This usually sets self.coef_old, self.coef, self.loss_value_old,
            #    self.loss_value, self.gradient_old, self.gradient,
            #    self.raw_prediction.
            self.line_search(X=X, y=y, sample_weight=sample_weight)
            if self.use_fallback_lbfgs_solve:
                break

            # 4. Check convergence
            #    Sets self.converged.
            self.check_convergence(X=X, y=y, sample_weight=sample_weight)

            # 5. Next iteration
            self.iteration += 1

        if not self.converged:
            if self.use_fallback_lbfgs_solve:
                # Note: The fallback solver circumvents check_convergence and relies on
                # the convergence checks of lbfgs instead. Enough warnings have been
                # raised on the way.
                self.fallback_lbfgs_solve(X=X, y=y, sample_weight=sample_weight)
            else:
                warnings.warn(
                    (
                        f"Newton solver did not converge after {self.iteration - 1} "
                        "iterations."
                    ),
                    ConvergenceWarning,
                )

        self.iteration -= 1
        self.finalize(X=X, y=y, sample_weight=sample_weight)
        return self.coef
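

# Note: NewtonCholeskySolver below materializes the full Hessian
#     H = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity
# and solves H @ coef_newton = -g with a dense symmetric solver. Roughly, building H
# costs O(n_samples * n_dof^2) and each solve costs O(n_dof^3), so this approach is
# best suited to problems with a moderate number of features (n_dof).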
class NewtonCholeskySolver(NewtonSolver):
    """Cholesky based Newton solver.

    Inner solver for finding the Newton step H w_newton = -g uses a Cholesky based
    linear solver.
    """

    def setup(self, X, y, sample_weight):
        super().setup(X=X, y=y, sample_weight=sample_weight)
        n_dof = X.shape[1]
        if self.linear_loss.fit_intercept:
            n_dof += 1
        self.gradient = np.empty_like(self.coef)
        self.hessian = np.empty_like(self.coef, shape=(n_dof, n_dof))

    def update_gradient_hessian(self, X, y, sample_weight):
        _, _, self.hessian_warning = self.linear_loss.gradient_hessian(
            coef=self.coef,
            X=X,
            y=y,
            sample_weight=sample_weight,
            l2_reg_strength=self.l2_reg_strength,
            n_threads=self.n_threads,
            gradient_out=self.gradient,
            hessian_out=self.hessian,
            raw_prediction=self.raw_prediction,  # this was updated in line_search
        )
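
    # inner_solve computes the Newton step by solving H @ coef_newton = -gradient via
    # scipy.linalg.solve(..., assume_a="sym"). It falls back to LBFGS whenever the
    # pointwise Hessian has many negative values, the computed step is not a descent
    # direction, or the Hessian is singular or very ill-conditioned.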
    def inner_solve(self, X, y, sample_weight):
        if self.hessian_warning:
            warnings.warn(
                (
                    f"The inner solver of {self.__class__.__name__} detected a "
                    "pointwise Hessian with many negative values at iteration "
                    f"#{self.iteration}. It will now resort to lbfgs instead."
                ),
                ConvergenceWarning,
            )
            if self.verbose:
                print(
                    "  The inner solver detected a pointwise Hessian with many "
                    "negative values and resorts to lbfgs instead."
                )
            self.use_fallback_lbfgs_solve = True
            return

        try:
            with warnings.catch_warnings():
                warnings.simplefilter("error", scipy.linalg.LinAlgWarning)
                self.coef_newton = scipy.linalg.solve(
                    self.hessian, -self.gradient, check_finite=False, assume_a="sym"
                )
            self.gradient_times_newton = self.gradient @ self.coef_newton
            if self.gradient_times_newton > 0:
                if self.verbose:
                    print(
                        "  The inner solver found a Newton step that is not a "
                        "descent direction and resorts to LBFGS steps instead."
                    )
                self.use_fallback_lbfgs_solve = True
                return
        except (np.linalg.LinAlgError, scipy.linalg.LinAlgWarning) as e:
            warnings.warn(
                f"The inner solver of {self.__class__.__name__} stumbled upon a "
                "singular or very ill-conditioned Hessian matrix at iteration "
                f"#{self.iteration}. It will now resort to lbfgs instead.\n"
                "Further options are to use another solver or to avoid such situation "
                "in the first place. Possible remedies are removing collinear features"
                " of X or increasing the penalization strengths.\n"
                "The original Linear Algebra message was:\n" + str(e),
                scipy.linalg.LinAlgWarning,
            )
            # Possible causes:
            # 1. hess_pointwise is negative. But this is already taken care of in
            #    LinearModelLoss.gradient_hessian.
            # 2. X is singular or ill-conditioned.
            #    This might be the most probable cause.
            #
            # There are many possible ways to deal with this situation. Most of them
            # add, explicitly or implicitly, a matrix to the Hessian to make it
            # positive definite, cf. Chapter 3.4 of Nocedal & Wright 2nd ed.
            # Instead, we resort to lbfgs.
            if self.verbose:
                print(
                    "  The inner solver stumbled upon a singular or ill-conditioned "
                    "Hessian matrix and resorts to LBFGS instead."
                )
            self.use_fallback_lbfgs_solve = True
            return
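
# Note for implementers: a new Newton solver subclasses NewtonSolver and implements at
# least update_gradient_hessian() and inner_solve(); setup() and finalize() can be
# overridden for solver specific pre- and post-computations, as NewtonCholeskySolver
# does with setup() above.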