some new features
This commit is contained in:
896
.venv/lib/python3.12/site-packages/scipy/optimize/_linesearch.py
Normal file
896
.venv/lib/python3.12/site-packages/scipy/optimize/_linesearch.py
Normal file
@ -0,0 +1,896 @@
|
||||
"""
|
||||
Functions
|
||||
---------
|
||||
.. autosummary::
|
||||
:toctree: generated/
|
||||
|
||||
line_search_armijo
|
||||
line_search_wolfe1
|
||||
line_search_wolfe2
|
||||
scalar_search_wolfe1
|
||||
scalar_search_wolfe2
|
||||
|
||||
"""
|
||||
from warnings import warn
|
||||
|
||||
from ._dcsrch import DCSRCH
|
||||
import numpy as np
|
||||
|
||||
__all__ = ['LineSearchWarning', 'line_search_wolfe1', 'line_search_wolfe2',
|
||||
'scalar_search_wolfe1', 'scalar_search_wolfe2',
|
||||
'line_search_armijo']
|
||||
|
||||
class LineSearchWarning(RuntimeWarning):
    """Warning issued when a line search fails to converge."""
|
||||
|
||||
|
||||
def _check_c1_c2(c1, c2):
|
||||
if not (0 < c1 < c2 < 1):
|
||||
raise ValueError("'c1' and 'c2' do not satisfy"
|
||||
"'0 < c1 < c2 < 1'.")
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Minpack's Wolfe line and scalar searches
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
def line_search_wolfe1(f, fprime, xk, pk, gfk=None,
                       old_fval=None, old_old_fval=None,
                       args=(), c1=1e-4, c2=0.9, amax=50, amin=1e-8,
                       xtol=1e-14):
    """As `scalar_search_wolfe1` but do a line search to direction `pk`.

    Parameters
    ----------
    f : callable
        Function ``f(x)``.
    fprime : callable
        Gradient of `f`.
    xk : array_like
        Current point.
    pk : array_like
        Search direction.
    gfk : array_like, optional
        Gradient of `f` at ``xk``; recomputed when omitted.
    old_fval : float, optional
        Value of `f` at ``xk``.
    old_old_fval : float, optional
        Value of `f` at the point preceding ``xk``.

    The remaining parameters are the same as for `scalar_search_wolfe1`.

    Returns
    -------
    stp, f_count, g_count, fval, old_fval
        As in `scalar_search_wolfe1`.
    gval : array
        Gradient of `f` at the final point.

    Notes
    -----
    Parameters `c1` and `c2` must satisfy ``0 < c1 < c2 < 1``.
    """
    if gfk is None:
        gfk = fprime(xk, *args)

    # Mutable cells so the scalar-search closures can report the
    # evaluation counts and the most recent full gradient back to us.
    n_fev = [0]
    n_gev = [0]
    last_grad = [gfk]

    def phi(step):
        n_fev[0] += 1
        return f(xk + step * pk, *args)

    def derphi(step):
        last_grad[0] = fprime(xk + step * pk, *args)
        n_gev[0] += 1
        return np.dot(last_grad[0], pk)

    stp, fval, old_fval = scalar_search_wolfe1(
        phi, derphi, old_fval, old_old_fval, np.dot(gfk, pk),
        c1=c1, c2=c2, amax=amax, amin=amin, xtol=xtol)

    return stp, n_fev[0], n_gev[0], fval, old_fval, last_grad[0]
|
||||
|
||||
|
||||
def scalar_search_wolfe1(phi, derphi, phi0=None, old_phi0=None, derphi0=None,
                         c1=1e-4, c2=0.9,
                         amax=50, amin=1e-8, xtol=1e-14):
    """Scalar search for an alpha satisfying the strong Wolfe conditions.

    ``alpha > 0`` is assumed to be a descent direction.  The search itself
    is delegated to routine DCSRCH from MINPACK.

    Parameters
    ----------
    phi : callable phi(alpha)
        Function at point `alpha`.
    derphi : callable phi'(alpha)
        Objective function derivative. Returns a scalar.
    phi0 : float, optional
        Value of phi at 0; computed when omitted.
    old_phi0 : float, optional
        Value of phi at the previous point.
    derphi0 : float, optional
        Value of derphi at 0; computed when omitted.
    c1 : float, optional
        Parameter for the Armijo condition rule.
    c2 : float, optional
        Parameter for the curvature condition rule.
    amax, amin : float, optional
        Maximum and minimum step size.
    xtol : float, optional
        Relative tolerance for an acceptable step.

    Returns
    -------
    alpha : float or None
        Step size, or None if no suitable step was found.
    phi : float
        Value of `phi` at the new point `alpha`.
    phi0 : float
        Value of `phi` at ``alpha=0``.

    Notes
    -----
    Parameters `c1` and `c2` must satisfy ``0 < c1 < c2 < 1`` as described
    in [1]_.

    References
    ----------
    .. [1] Nocedal, J., & Wright, S. J. (2006). Numerical optimization.
           Springer Series in Operations Research and Financial Engineering.
           Springer Nature.
    """
    _check_c1_c2(c1, c2)

    if phi0 is None:
        phi0 = phi(0.)
    if derphi0 is None:
        derphi0 = derphi(0.)

    # First trial step: extrapolate from the previous decrease when history
    # is available; fall back to 1.0 otherwise or when the guess is negative.
    alpha1 = 1.0
    if old_phi0 is not None and derphi0 != 0:
        guess = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
        if guess >= 0:
            alpha1 = guess

    searcher = DCSRCH(phi, derphi, c1, c2, xtol, amin, amax)
    stp, phi1, phi0, task = searcher(
        alpha1, phi0=phi0, derphi0=derphi0, maxiter=100)

    return stp, phi1, phi0
|
||||
|
||||
|
||||
# Backwards-compatible alias: the module-level `line_search` is the
# MINPACK-based Wolfe search (`line_search_wolfe1`).
line_search = line_search_wolfe1
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Pure-Python Wolfe line and scalar searches
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
# Note: `line_search_wolfe2` is the public `scipy.optimize.line_search`
|
||||
|
||||
def line_search_wolfe2(f, myfprime, xk, pk, gfk=None, old_fval=None,
                       old_old_fval=None, args=(), c1=1e-4, c2=0.9, amax=None,
                       extra_condition=None, maxiter=10):
    """Find alpha that satisfies strong Wolfe conditions.

    Parameters
    ----------
    f : callable f(x,*args)
        Objective function.
    myfprime : callable f'(x,*args)
        Objective function gradient.
    xk : ndarray
        Starting point.
    pk : ndarray
        Search direction. Must be a descent direction for the algorithm
        to converge.
    gfk : ndarray, optional
        Gradient value for ``x=xk``; recomputed if omitted.
    old_fval : float, optional
        Function value for ``x=xk``; recomputed if omitted.
    old_old_fval : float, optional
        Function value for the point preceding ``x=xk``.
    args : tuple, optional
        Additional arguments passed to the objective function.
    c1 : float, optional
        Parameter for the Armijo condition rule.
    c2 : float, optional
        Parameter for the curvature condition rule.
    amax : float, optional
        Maximum step size.
    extra_condition : callable, optional
        A callable ``extra_condition(alpha, x, f, g) -> bool``.  A step
        satisfying the strong Wolfe conditions is accepted only if this
        returns True; otherwise the search continues with new iterates.
    maxiter : int, optional
        Maximum number of iterations to perform.

    Returns
    -------
    alpha : float or None
        Alpha for which ``x_new = x0 + alpha * pk``, or None if the line
        search algorithm did not converge.
    fc : int
        Number of function evaluations made.
    gc : int
        Number of gradient evaluations made.
    new_fval : float or None
        New function value ``f(x_new)``, or None on failure.
    old_fval : float
        Old function value ``f(x0)``.
    new_slope : float or None
        Local slope ``<myfprime(x_new), pk>`` along the search direction at
        the new value, or None on failure.

    Notes
    -----
    Enforces the strong Wolfe conditions; see Wright and Nocedal,
    'Numerical Optimization', 1999, pp. 59-61.  If `pk` is not a descent
    direction, `alpha`, `new_fval` and `new_slope` will be None.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.optimize import line_search
    >>> def obj_func(x):
    ...     return (x[0])**2+(x[1])**2
    >>> def obj_grad(x):
    ...     return [2*x[0], 2*x[1]]
    >>> start_point = np.array([1.8, 1.7])
    >>> search_gradient = np.array([-1.0, -1.0])
    >>> line_search(obj_func, obj_grad, start_point, search_gradient)
    (1.0, 2, 1, 1.1300000000000001, 6.13, [1.6, 1.4])
    """
    n_fev = [0]
    n_gev = [0]
    latest_grad = [None]
    latest_grad_alpha = [None]

    def phi(alpha):
        n_fev[0] += 1
        return f(xk + alpha * pk, *args)

    fprime = myfprime

    def derphi(alpha):
        n_gev[0] += 1
        # Cache the full gradient so the caller can reuse it afterwards.
        latest_grad[0] = fprime(xk + alpha * pk, *args)
        latest_grad_alpha[0] = alpha
        return np.dot(latest_grad[0], pk)

    if gfk is None:
        gfk = fprime(xk, *args)
    derphi0 = np.dot(gfk, pk)

    if extra_condition is None:
        extra_condition2 = None
    else:
        def extra_condition2(alpha, phi_value):
            # Supply the current gradient as an argument, reusing the
            # cached one when it matches this alpha.
            if latest_grad_alpha[0] != alpha:
                derphi(alpha)
            return extra_condition(alpha, xk + alpha * pk, phi_value,
                                   latest_grad[0])

    alpha_star, phi_star, old_fval, derphi_star = scalar_search_wolfe2(
        phi, derphi, old_fval, old_old_fval, derphi0, c1, c2, amax,
        extra_condition2, maxiter=maxiter)

    if derphi_star is None:
        warn('The line search algorithm did not converge',
             LineSearchWarning, stacklevel=2)
    else:
        # derphi_star was a scalar slope; hand back the full gradient most
        # recently computed while evaluating it, so the outer loop need not
        # recompute the gradient at the accepted step.
        derphi_star = latest_grad[0]

    return alpha_star, n_fev[0], n_gev[0], phi_star, old_fval, derphi_star
|
||||
|
||||
|
||||
def scalar_search_wolfe2(phi, derphi, phi0=None,
                         old_phi0=None, derphi0=None,
                         c1=1e-4, c2=0.9, amax=None,
                         extra_condition=None, maxiter=10):
    """Find alpha that satisfies strong Wolfe conditions.

    alpha > 0 is assumed to be a descent direction.

    Parameters
    ----------
    phi : callable phi(alpha)
        Objective scalar function.
    derphi : callable phi'(alpha)
        Objective function derivative. Returns a scalar.
    phi0 : float, optional
        Value of phi at 0.
    old_phi0 : float, optional
        Value of phi at previous point.
    derphi0 : float, optional
        Value of derphi at 0
    c1 : float, optional
        Parameter for Armijo condition rule.
    c2 : float, optional
        Parameter for curvature condition rule.
    amax : float, optional
        Maximum step size.
    extra_condition : callable, optional
        A callable of the form ``extra_condition(alpha, phi_value)``
        returning a boolean. The line search accepts the value
        of ``alpha`` only if this callable returns ``True``.
        If the callable returns ``False`` for the step length,
        the algorithm will continue with new iterates.
        The callable is only called for iterates satisfying
        the strong Wolfe conditions.
    maxiter : int, optional
        Maximum number of iterations to perform.

    Returns
    -------
    alpha_star : float or None
        Best alpha, or None if the line search algorithm did not converge.
    phi_star : float
        phi at alpha_star.
    phi0 : float
        phi at 0.
    derphi_star : float or None
        derphi at alpha_star, or None if the line search algorithm
        did not converge.

    Notes
    -----
    Uses the line search algorithm to enforce strong Wolfe
    conditions. See Wright and Nocedal, 'Numerical Optimization',
    1999, pp. 59-61.

    Step lengths are grown until the Wolfe point is bracketed, then the
    bracket is refined by `_zoom`.
    """
    _check_c1_c2(c1, c2)

    if phi0 is None:
        phi0 = phi(0.)

    if derphi0 is None:
        derphi0 = derphi(0.)

    alpha0 = 0
    # Initial step guess extrapolated from the previous decrease when
    # history is available; fall back to 1.0 otherwise.
    if old_phi0 is not None and derphi0 != 0:
        alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
    else:
        alpha1 = 1.0

    # A negative extrapolated guess is meaningless; reset it to 1.0.
    if alpha1 < 0:
        alpha1 = 1.0

    if amax is not None:
        alpha1 = min(alpha1, amax)

    phi_a1 = phi(alpha1)
    #derphi_a1 = derphi(alpha1) evaluated below

    phi_a0 = phi0
    derphi_a0 = derphi0

    # Default extra condition: accept any Wolfe-satisfying step.
    if extra_condition is None:
        def extra_condition(alpha, phi):
            return True

    for i in range(maxiter):
        if alpha1 == 0 or (amax is not None and alpha0 > amax):
            # alpha1 == 0: This shouldn't happen. Perhaps the increment has
            # slipped below machine precision?
            alpha_star = None
            phi_star = phi0
            phi0 = old_phi0
            derphi_star = None

            if alpha1 == 0:
                msg = 'Rounding errors prevent the line search from converging'
            else:
                msg = "The line search algorithm could not find a solution " + \
                      "less than or equal to amax: %s" % amax

            warn(msg, LineSearchWarning, stacklevel=2)
            break

        not_first_iteration = i > 0
        # Armijo condition violated (or phi stopped decreasing after the
        # first iteration): [alpha0, alpha1] brackets a Wolfe point.
        if (phi_a1 > phi0 + c1 * alpha1 * derphi0) or \
           ((phi_a1 >= phi_a0) and not_first_iteration):
            alpha_star, phi_star, derphi_star = \
                        _zoom(alpha0, alpha1, phi_a0,
                              phi_a1, derphi_a0, phi, derphi,
                              phi0, derphi0, c1, c2, extra_condition)
            break

        derphi_a1 = derphi(alpha1)
        # Curvature condition also holds: strong Wolfe satisfied at alpha1.
        if (abs(derphi_a1) <= -c2*derphi0):
            if extra_condition(alpha1, phi_a1):
                alpha_star = alpha1
                phi_star = phi_a1
                derphi_star = derphi_a1
                break

        # Positive slope: the bracket is [alpha1, alpha0] (reversed order).
        if (derphi_a1 >= 0):
            alpha_star, phi_star, derphi_star = \
                        _zoom(alpha1, alpha0, phi_a1,
                              phi_a0, derphi_a1, phi, derphi,
                              phi0, derphi0, c1, c2, extra_condition)
            break

        alpha2 = 2 * alpha1  # increase by factor of two on each iteration
        if amax is not None:
            alpha2 = min(alpha2, amax)
        alpha0 = alpha1
        alpha1 = alpha2
        phi_a0 = phi_a1
        phi_a1 = phi(alpha1)
        derphi_a0 = derphi_a1

    else:
        # stopping test maxiter reached
        alpha_star = alpha1
        phi_star = phi_a1
        derphi_star = None
        warn('The line search algorithm did not converge',
             LineSearchWarning, stacklevel=2)

    return alpha_star, phi_star, phi0, derphi_star
|
||||
|
||||
|
||||
def _cubicmin(a, fa, fpa, b, fb, c, fc):
|
||||
"""
|
||||
Finds the minimizer for a cubic polynomial that goes through the
|
||||
points (a,fa), (b,fb), and (c,fc) with derivative at a of fpa.
|
||||
|
||||
If no minimizer can be found, return None.
|
||||
|
||||
"""
|
||||
# f(x) = A *(x-a)^3 + B*(x-a)^2 + C*(x-a) + D
|
||||
|
||||
with np.errstate(divide='raise', over='raise', invalid='raise'):
|
||||
try:
|
||||
C = fpa
|
||||
db = b - a
|
||||
dc = c - a
|
||||
denom = (db * dc) ** 2 * (db - dc)
|
||||
d1 = np.empty((2, 2))
|
||||
d1[0, 0] = dc ** 2
|
||||
d1[0, 1] = -db ** 2
|
||||
d1[1, 0] = -dc ** 3
|
||||
d1[1, 1] = db ** 3
|
||||
[A, B] = np.dot(d1, np.asarray([fb - fa - C * db,
|
||||
fc - fa - C * dc]).flatten())
|
||||
A /= denom
|
||||
B /= denom
|
||||
radical = B * B - 3 * A * C
|
||||
xmin = a + (-B + np.sqrt(radical)) / (3 * A)
|
||||
except ArithmeticError:
|
||||
return None
|
||||
if not np.isfinite(xmin):
|
||||
return None
|
||||
return xmin
|
||||
|
||||
|
||||
def _quadmin(a, fa, fpa, b, fb):
|
||||
"""
|
||||
Finds the minimizer for a quadratic polynomial that goes through
|
||||
the points (a,fa), (b,fb) with derivative at a of fpa.
|
||||
|
||||
"""
|
||||
# f(x) = B*(x-a)^2 + C*(x-a) + D
|
||||
with np.errstate(divide='raise', over='raise', invalid='raise'):
|
||||
try:
|
||||
D = fa
|
||||
C = fpa
|
||||
db = b - a * 1.0
|
||||
B = (fb - D - C * db) / (db * db)
|
||||
xmin = a - C / (2.0 * B)
|
||||
except ArithmeticError:
|
||||
return None
|
||||
if not np.isfinite(xmin):
|
||||
return None
|
||||
return xmin
|
||||
|
||||
|
||||
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo,
          phi, derphi, phi0, derphi0, c1, c2, extra_condition):
    """Zoom stage of approximate linesearch satisfying strong Wolfe conditions.

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Notes
    -----
    Implements Algorithm 3.6 (zoom) in Wright and Nocedal,
    'Numerical Optimization', 1999, pp. 61.

    """

    maxiter = 10
    i = 0
    delta1 = 0.2  # cubic interpolant check
    delta2 = 0.1  # quadratic interpolant check
    phi_rec = phi0  # phi at the most recently discarded bracket endpoint
    a_rec = 0       # the discarded endpoint itself
    while True:
        # interpolate to find a trial step length between a_lo and
        # a_hi Need to choose interpolation here. Use cubic
        # interpolation and then if the result is within delta *
        # dalpha or outside of the interval bounded by a_lo or a_hi
        # then use quadratic interpolation, if the result is still too
        # close, then use bisection

        dalpha = a_hi - a_lo
        # The bracket may be ordered either way; (a, b) is the sorted pair.
        if dalpha < 0:
            a, b = a_hi, a_lo
        else:
            a, b = a_lo, a_hi

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval), then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi if the result is still too close to the
        # end points (or out of the interval) then use bisection

        if (i > 0):
            cchk = delta1 * dalpha
            a_j = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
                            a_rec, phi_rec)
        # NOTE: short-circuiting on (i == 0) guarantees cchk and a_j are
        # only read when they were assigned in the branch above.
        if (i == 0) or (a_j is None) or (a_j > b - cchk) or (a_j < a + cchk):
            qchk = delta2 * dalpha
            a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
            if (a_j is None) or (a_j > b-qchk) or (a_j < a+qchk):
                # Both interpolants rejected: fall back to bisection.
                a_j = a_lo + 0.5*dalpha

        # Check new value of a_j

        phi_aj = phi(a_j)
        if (phi_aj > phi0 + c1*a_j*derphi0) or (phi_aj >= phi_lo):
            # Armijo fails, or no improvement over the low endpoint:
            # shrink the bracket from the high side.
            phi_rec = phi_hi
            a_rec = a_hi
            a_hi = a_j
            phi_hi = phi_aj
        else:
            derphi_aj = derphi(a_j)
            # Strong Wolfe curvature condition plus the caller's extra test.
            if abs(derphi_aj) <= -c2*derphi0 and extra_condition(a_j, phi_aj):
                a_star = a_j
                val_star = phi_aj
                valprime_star = derphi_aj
                break
            if derphi_aj*(a_hi - a_lo) >= 0:
                # Slope points towards a_hi: move the high end onto a_lo.
                phi_rec = phi_hi
                a_rec = a_hi
                a_hi = a_lo
                phi_hi = phi_lo
            else:
                phi_rec = phi_lo
                a_rec = a_lo
            a_lo = a_j
            phi_lo = phi_aj
            derphi_lo = derphi_aj
        i += 1
        if (i > maxiter):
            # Failed to find a conforming step size
            a_star = None
            val_star = None
            valprime_star = None
            break
    return a_star, val_star, valprime_star
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Armijo line and scalar searches
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
def line_search_armijo(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
    """Minimize over alpha, the function ``f(xk+alpha pk)``.

    Parameters
    ----------
    f : callable
        Function to be minimized.
    xk : array_like
        Current point.
    pk : array_like
        Search direction.
    gfk : array_like
        Gradient of `f` at point `xk`.
    old_fval : float
        Value of `f` at point `xk`; recomputed when None.
    args : tuple, optional
        Optional arguments passed to `f`.
    c1 : float, optional
        Value to control stopping criterion.
    alpha0 : scalar, optional
        Value of `alpha` at start of the optimization.

    Returns
    -------
    alpha
        Accepted step length (None on failure).
    f_count
        Number of function evaluations.
    f_val_at_alpha
        Function value at the returned step.

    Notes
    -----
    Uses the interpolation algorithm (Armijo backtracking) as suggested by
    Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57.
    """
    xk = np.atleast_1d(xk)
    n_calls = [0]

    def objective_along_ray(step):
        n_calls[0] += 1
        return f(xk + step * pk, *args)

    # Reuse the caller-supplied value of f(xk) when available.
    phi0 = objective_along_ray(0.) if old_fval is None else old_fval

    derphi0 = np.dot(gfk, pk)
    alpha, phi1 = scalar_search_armijo(objective_along_ray, phi0, derphi0,
                                       c1=c1, alpha0=alpha0)
    return alpha, n_calls[0], phi1
|
||||
|
||||
|
||||
def line_search_BFGS(f, xk, pk, gfk, old_fval, args=(), c1=1e-4, alpha0=1):
    """Compatibility wrapper for `line_search_armijo`.

    Returns the same values with a zero gradient-evaluation count inserted
    at position 2, matching the historical BFGS line-search signature.
    """
    alpha, fcount, fval = line_search_armijo(f, xk, pk, gfk, old_fval,
                                             args=args, c1=c1, alpha0=alpha0)
    # The Armijo search never evaluates the gradient, hence gc == 0.
    return alpha, fcount, 0, fval
|
||||
|
||||
|
||||
def scalar_search_armijo(phi, phi0, derphi0, c1=1e-4, alpha0=1, amin=0):
    """Minimize over alpha, the function ``phi(alpha)``.

    Uses the interpolation algorithm (Armijo backtracking) as suggested by
    Wright and Nocedal in 'Numerical Optimization', 1999, pp. 56-57.

    alpha > 0 is assumed to be a descent direction.

    Returns
    -------
    alpha
        Accepted step, or None if no suitable step was found.
    phi1
        ``phi`` at the last evaluated step.
    """
    def sufficient_decrease(alpha, phi_value):
        # First Wolfe (Armijo) condition.
        return phi_value <= phi0 + c1 * alpha * derphi0

    phi_a0 = phi(alpha0)
    if sufficient_decrease(alpha0, phi_a0):
        return alpha0, phi_a0

    # Minimizer of the quadratic interpolant through phi0, derphi0, phi_a0.
    alpha1 = -(derphi0) * alpha0**2 / 2.0 / (phi_a0 - phi0 - derphi0 * alpha0)
    phi_a1 = phi(alpha1)
    if sufficient_decrease(alpha1, phi_a1):
        return alpha1, phi_a1

    # Loop with cubic interpolation until an alpha satisfying the first
    # Wolfe condition is found (since we are backtracking, the step is
    # assumed not too small, so the second condition is taken for granted).
    while alpha1 > amin:
        factor = alpha0**2 * alpha1**2 * (alpha1 - alpha0)
        r1 = phi_a1 - phi0 - derphi0 * alpha1
        r0 = phi_a0 - phi0 - derphi0 * alpha0
        a = (alpha0**2 * r1 - alpha1**2 * r0) / factor
        b = (-alpha0**3 * r1 + alpha1**3 * r0) / factor

        alpha2 = (-b + np.sqrt(abs(b**2 - 3 * a * derphi0))) / (3.0 * a)
        phi_a2 = phi(alpha2)
        if sufficient_decrease(alpha2, phi_a2):
            return alpha2, phi_a2

        # Safeguard: halve the step when the cubic step shrinks too slowly
        # or barely moves.
        if (alpha1 - alpha2) > alpha1 / 2.0 or (1 - alpha2 / alpha1) < 0.96:
            alpha2 = alpha1 / 2.0

        alpha0, alpha1 = alpha1, alpha2
        phi_a0, phi_a1 = phi_a1, phi_a2

    # Failed to find a suitable step length.
    return None, phi_a1
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Non-monotone line search for DF-SANE
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
def _nonmonotone_line_search_cruz(f, x_k, d, prev_fs, eta,
|
||||
gamma=1e-4, tau_min=0.1, tau_max=0.5):
|
||||
"""
|
||||
Nonmonotone backtracking line search as described in [1]_
|
||||
|
||||
Parameters
|
||||
----------
|
||||
f : callable
|
||||
Function returning a tuple ``(f, F)`` where ``f`` is the value
|
||||
of a merit function and ``F`` the residual.
|
||||
x_k : ndarray
|
||||
Initial position.
|
||||
d : ndarray
|
||||
Search direction.
|
||||
prev_fs : float
|
||||
List of previous merit function values. Should have ``len(prev_fs) <= M``
|
||||
where ``M`` is the nonmonotonicity window parameter.
|
||||
eta : float
|
||||
Allowed merit function increase, see [1]_
|
||||
gamma, tau_min, tau_max : float, optional
|
||||
Search parameters, see [1]_
|
||||
|
||||
Returns
|
||||
-------
|
||||
alpha : float
|
||||
Step length
|
||||
xp : ndarray
|
||||
Next position
|
||||
fp : float
|
||||
Merit function value at next position
|
||||
Fp : ndarray
|
||||
Residual at next position
|
||||
|
||||
References
|
||||
----------
|
||||
[1] "Spectral residual method without gradient information for solving
|
||||
large-scale nonlinear systems of equations." W. La Cruz,
|
||||
J.M. Martinez, M. Raydan. Math. Comp. **75**, 1429 (2006).
|
||||
|
||||
"""
|
||||
f_k = prev_fs[-1]
|
||||
f_bar = max(prev_fs)
|
||||
|
||||
alpha_p = 1
|
||||
alpha_m = 1
|
||||
alpha = 1
|
||||
|
||||
while True:
|
||||
xp = x_k + alpha_p * d
|
||||
fp, Fp = f(xp)
|
||||
|
||||
if fp <= f_bar + eta - gamma * alpha_p**2 * f_k:
|
||||
alpha = alpha_p
|
||||
break
|
||||
|
||||
alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)
|
||||
|
||||
xp = x_k - alpha_m * d
|
||||
fp, Fp = f(xp)
|
||||
|
||||
if fp <= f_bar + eta - gamma * alpha_m**2 * f_k:
|
||||
alpha = -alpha_m
|
||||
break
|
||||
|
||||
alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)
|
||||
|
||||
alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
|
||||
alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)
|
||||
|
||||
return alpha, xp, fp, Fp
|
||||
|
||||
|
||||
def _nonmonotone_line_search_cheng(f, x_k, d, f_k, C, Q, eta,
|
||||
gamma=1e-4, tau_min=0.1, tau_max=0.5,
|
||||
nu=0.85):
|
||||
"""
|
||||
Nonmonotone line search from [1]
|
||||
|
||||
Parameters
|
||||
----------
|
||||
f : callable
|
||||
Function returning a tuple ``(f, F)`` where ``f`` is the value
|
||||
of a merit function and ``F`` the residual.
|
||||
x_k : ndarray
|
||||
Initial position.
|
||||
d : ndarray
|
||||
Search direction.
|
||||
f_k : float
|
||||
Initial merit function value.
|
||||
C, Q : float
|
||||
Control parameters. On the first iteration, give values
|
||||
Q=1.0, C=f_k
|
||||
eta : float
|
||||
Allowed merit function increase, see [1]_
|
||||
nu, gamma, tau_min, tau_max : float, optional
|
||||
Search parameters, see [1]_
|
||||
|
||||
Returns
|
||||
-------
|
||||
alpha : float
|
||||
Step length
|
||||
xp : ndarray
|
||||
Next position
|
||||
fp : float
|
||||
Merit function value at next position
|
||||
Fp : ndarray
|
||||
Residual at next position
|
||||
C : float
|
||||
New value for the control parameter C
|
||||
Q : float
|
||||
New value for the control parameter Q
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] W. Cheng & D.-H. Li, ''A derivative-free nonmonotone line
|
||||
search and its application to the spectral residual
|
||||
method'', IMA J. Numer. Anal. 29, 814 (2009).
|
||||
|
||||
"""
|
||||
alpha_p = 1
|
||||
alpha_m = 1
|
||||
alpha = 1
|
||||
|
||||
while True:
|
||||
xp = x_k + alpha_p * d
|
||||
fp, Fp = f(xp)
|
||||
|
||||
if fp <= C + eta - gamma * alpha_p**2 * f_k:
|
||||
alpha = alpha_p
|
||||
break
|
||||
|
||||
alpha_tp = alpha_p**2 * f_k / (fp + (2*alpha_p - 1)*f_k)
|
||||
|
||||
xp = x_k - alpha_m * d
|
||||
fp, Fp = f(xp)
|
||||
|
||||
if fp <= C + eta - gamma * alpha_m**2 * f_k:
|
||||
alpha = -alpha_m
|
||||
break
|
||||
|
||||
alpha_tm = alpha_m**2 * f_k / (fp + (2*alpha_m - 1)*f_k)
|
||||
|
||||
alpha_p = np.clip(alpha_tp, tau_min * alpha_p, tau_max * alpha_p)
|
||||
alpha_m = np.clip(alpha_tm, tau_min * alpha_m, tau_max * alpha_m)
|
||||
|
||||
# Update C and Q
|
||||
Q_next = nu * Q + 1
|
||||
C = (nu * Q * (C + eta) + fp) / Q_next
|
||||
Q = Q_next
|
||||
|
||||
return alpha, xp, fp, Fp, C, Q
|
||||
Reference in New Issue
Block a user