Source code for skfolio.utils.stats

"""Tools module"""

import warnings

# Copyright (c) 2023
# Author: Hugo Delatte <delatte.hugo@gmail.com>
# License: BSD 3 clause
# Implementation derived from:
# Riskfolio-Lib, Copyright (c) 2020-2023, Dany Cajas, Licensed under BSD 3 clause.
# Statsmodels, Copyright (C) 2006, Jonathan E. Taylor, Licensed under BSD 3 clause.
from enum import auto

import cvxpy as cp
import numpy as np
import scipy.cluster.hierarchy as sch
import scipy.optimize as sco
import scipy.sparse.linalg as scl
import scipy.spatial.distance as scd
import scipy.special as scs
from scipy.sparse import csr_matrix

from skfolio.utils.tools import AutoEnum

__all__ = [
    "NBinsMethod",
    "n_bins_freedman",
    "n_bins_knuth",
    "is_cholesky_dec",
    "assert_is_square",
    "assert_is_symmetric",
    "assert_is_distance",
    "cov_nearest",
    "cov_to_corr",
    "corr_to_cov",
    "commutation_matrix",
    "compute_optimal_n_clusters",
    "rand_weights",
    "rand_weights_dirichlet",
    "minimize_relative_weight_deviation",
]


class NBinsMethod(AutoEnum):
    """Enumeration of the number of bins methods.

    Parameters
    ----------
    FREEDMAN : str
        Freedman method.

    KNUTH : str
        Knuth method.
    """

    FREEDMAN = auto()
    KNUTH = auto()


def n_bins_freedman(x: np.ndarray) -> int:
    """Compute the optimal histogram bin size using the Freedman-Diaconis rule [1]_.

    Parameters
    ----------
    x : ndarray of shape (n_observations,)
        The input array.

    Returns
    -------
    n_bins : int
        The optimal bin size.

    References
    ----------
    .. [1] "On the histogram as a density estimator: L2 theory".
        Freedman & Diaconis (1981).
    """
    if x.ndim != 1:
        raise ValueError("`x` must be a 1d-array")
    n = len(x)
    p_25, p_75 = np.percentile(x, [25, 75])
    d = 2 * (p_75 - p_25) / (n ** (1 / 3))
    if d == 0:
        return 5
    n_bins = max(1, np.ceil((np.max(x) - np.min(x)) / d))
    return int(round(n_bins))


def n_bins_knuth(x: np.ndarray) -> int:
    """Compute the optimal histogram bin size using Knuth's rule [1]_.

    Parameters
    ----------
    x : ndarray of shape (n_observations,)
        The input array.

    Returns
    -------
    n_bins : int
        The optimal bin size.

    References
    ----------
    .. [1] "Optimal Data-Based Binning for Histograms".
        Knuth.
    """
    x = np.sort(x)
    n = len(x)

    def func(y: np.ndarray) -> float:
        y = y[0]
        if y <= 0:
            return np.inf
        bin_edges = np.linspace(x[0], x[-1], int(y) + 1)
        hist, _ = np.histogram(x, bin_edges)
        return -(
            n * np.log(y)
            + scs.gammaln(0.5 * y)
            - y * scs.gammaln(0.5)
            - scs.gammaln(n + 0.5 * y)
            + np.sum(scs.gammaln(hist + 0.5))
        )

    n_bins_init = n_bins_freedman(x)
    n_bins = sco.fmin(func, n_bins_init, disp=0)[0]
    return int(round(n_bins))


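# Illustrative usage sketch, not part of the original module: it shows both
# bin-count rules on synthetic data. The helper name `_example_n_bins` is
# hypothetical.
def _example_n_bins() -> None:
    rng = np.random.default_rng(0)
    x = rng.standard_normal(1_000)
    # Freedman-Diaconis rule: driven by the inter-quartile range.
    print(n_bins_freedman(x))
    # Knuth's rule: maximizes a Bayesian posterior over the number of bins.
    print(n_bins_knuth(x))

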
def rand_weights_dirichlet(n: int) -> np.ndarray:
    """Produce n random weights that sum to one, drawn from a Dirichlet distribution
    (uniform distribution over the simplex).

    Parameters
    ----------
    n : int
        Number of weights.

    Returns
    -------
    weights : ndarray of shape (n, )
        The vector of weights.
    """
    return np.random.dirichlet(np.ones(n))


def rand_weights(n: int, zeros: int = 0) -> np.ndarray:
    """Produce n random weights that sum to one, drawn from a uniform distribution
    then normalized (non-uniform distribution over the simplex).

    Parameters
    ----------
    n : int
        Number of weights.

    zeros : int, default=0
        The number of weights to randomly set to zero.

    Returns
    -------
    weights : ndarray of shape (n, )
        The vector of weights.
    """
    k = np.random.rand(n)
    if zeros > 0:
        zeros_idx = np.random.choice(n, zeros, replace=False)
        k[zeros_idx] = 0
    return k / sum(k)


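# Illustrative usage sketch, not part of the original module: it draws random
# weight vectors on the simplex. The helper name `_example_rand_weights` is
# hypothetical.
def _example_rand_weights() -> None:
    # Dirichlet(1, ..., 1) draws are uniform over the simplex.
    w1 = rand_weights_dirichlet(n=5)
    # Normalized uniform draws are not uniform over the simplex; here two weights
    # are also forced to zero.
    w2 = rand_weights(n=5, zeros=2)
    assert np.isclose(w1.sum(), 1.0) and np.isclose(w2.sum(), 1.0)

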
def is_cholesky_dec(x: np.ndarray) -> bool:
    """Return True if the Cholesky decomposition can be computed.

    The matrix must be Hermitian (symmetric if real-valued) and positive-definite.
    No checking is performed to verify whether the matrix is Hermitian or not.

    Parameters
    ----------
    x : ndarray of shape (n, n)
        The matrix.

    Returns
    -------
    value : bool
        True if the Cholesky decomposition can be applied to the matrix,
        False otherwise.
    """
    # Around 100 times faster than checking for positive eigenvalues with
    # np.linalg.eigh
    try:
        np.linalg.cholesky(x)
        return True
    except np.linalg.LinAlgError:
        return False


def is_positive_definite(x: np.ndarray) -> bool:
    """Return True if the matrix is positive definite.

    Parameters
    ----------
    x : ndarray of shape (n, n)
        The matrix.

    Returns
    -------
    value : bool
        True if the matrix is positive definite, False otherwise.
    """
    return np.all(np.linalg.eigvals(x) > 0)


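# Illustrative usage sketch, not part of the original module: `is_cholesky_dec` is
# the fast positive-definiteness screen and `is_positive_definite` the eigenvalue
# check; `cov_nearest` below uses both because each alone can be fooled by
# floating-point error. The helper name `_example_definiteness_checks` is
# hypothetical.
def _example_definiteness_checks() -> None:
    identity = np.eye(3)
    assert is_cholesky_dec(identity) and is_positive_definite(identity)
    indefinite = np.array([[1.0, 2.0], [2.0, 1.0]])  # eigenvalues 3 and -1
    assert not is_cholesky_dec(indefinite) and not is_positive_definite(indefinite)

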
def assert_is_square(x: np.ndarray) -> None:
    """Raise an error if the matrix is not square.

    Parameters
    ----------
    x : ndarray of shape (n, n)
        The matrix.

    Raises
    ------
    ValueError
        If the matrix is not square.
    """
    if x.ndim != 2 or x.shape[0] != x.shape[1]:
        raise ValueError("The matrix must be square")


def assert_is_symmetric(x: np.ndarray) -> None:
    """Raise an error if the matrix is not symmetric.

    Parameters
    ----------
    x : ndarray of shape (n, n)
        The matrix.

    Raises
    ------
    ValueError
        If the matrix is not symmetric.
    """
    assert_is_square(x)
    if not np.allclose(x, x.T):
        raise ValueError("The matrix must be symmetric")


def assert_is_distance(x: np.ndarray) -> None:
    """Raise an error if the matrix is not a distance matrix.

    Parameters
    ----------
    x : ndarray of shape (n, n)
        The matrix.

    Raises
    ------
    ValueError
        If the matrix is not a distance matrix.
    """
    assert_is_symmetric(x)
    if not np.allclose(np.diag(x), np.zeros(x.shape[0]), atol=1e-5):
        raise ValueError(
            "The distance matrix must have diagonal elements close to zero"
        )


def cov_to_corr(cov: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Convert a covariance matrix to a correlation matrix.

    Parameters
    ----------
    cov : ndarray of shape (n, n)
        Covariance matrix.

    Returns
    -------
    corr, std : tuple[ndarray of shape (n, n), ndarray of shape (n, )]
        Correlation matrix and standard-deviation vector.
    """
    if cov.ndim != 2:
        raise ValueError(f"`cov` must be a 2D array, got a {cov.ndim}D array")
    std = np.sqrt(np.diag(cov))
    corr = cov / std / std[:, None]
    return corr, std


def corr_to_cov(corr: np.ndarray, std: np.ndarray) -> np.ndarray:
    """Convert a correlation matrix to a covariance matrix given its
    standard-deviation vector.

    Parameters
    ----------
    corr : ndarray of shape (n, n)
        Correlation matrix.

    std : ndarray of shape (n, )
        Standard-deviation vector.

    Returns
    -------
    cov : ndarray of shape (n, n)
        Covariance matrix.
    """
    if std.ndim != 1:
        raise ValueError(f"`std` must be a 1D array, got a {std.ndim}D array")
    if corr.ndim != 2:
        raise ValueError(f"`corr` must be a 2D array, got a {corr.ndim}D array")
    cov = corr * std * std[:, None]
    return cov


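# Illustrative usage sketch, not part of the original module: `cov_to_corr` and
# `corr_to_cov` are inverses of each other. The helper name
# `_example_corr_cov_round_trip` is hypothetical.
def _example_corr_cov_round_trip() -> None:
    cov = np.array([[0.04, 0.006], [0.006, 0.09]])
    corr, std = cov_to_corr(cov)
    assert np.allclose(corr_to_cov(corr, std), cov)

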
_CLIPPING_VALUE = 1e-13


def cov_nearest(
    cov: np.ndarray,
    higham: bool = False,
    higham_max_iteration: int = 100,
    warn: bool = False,
) -> np.ndarray:
    """Compute the nearest covariance matrix that is positive definite and whose
    Cholesky decomposition can be computed. The variances are left unchanged.

    A covariance matrix that is not positive definite often occurs in
    high-dimensional problems. It can be due to multicollinearity, floating-point
    inaccuracies, or when the number of observations is smaller than the number of
    assets.

    First, the covariance matrix is converted to a correlation matrix. Then, the
    nearest correlation matrix is found and converted back to a covariance matrix
    using the initial standard deviations.

    The Cholesky decomposition can fail on a symmetric positive definite (SPD)
    matrix due to floating-point error and, conversely, can succeed on a non-SPD
    matrix. Therefore, we need to test for both. We always start by testing the
    Cholesky decomposition, which is significantly faster than checking for
    positive eigenvalues.

    Parameters
    ----------
    cov : ndarray of shape (n, n)
        Covariance matrix.

    higham : bool, default=False
        If this is set to True, the Higham (2002) algorithm [1]_ is used, otherwise
        the eigenvalues are clipped to a small positive threshold (1e-13).
        The default (`False`) is to use the clipping method, as the Higham
        algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham (2002) algorithm.
        The default value is `100`.

    warn : bool, default=False
        If this is set to True, a user warning is emitted when the covariance
        matrix is not positive definite and is replaced by the nearest one.
        The default is False.

    Returns
    -------
    cov : ndarray
        The nearest covariance matrix.

    References
    ----------
    .. [1] "Computing the nearest correlation matrix - a problem from finance".
        IMA Journal of Numerical Analysis.
        Higham (2002).
    """
    assert_is_square(cov)
    assert_is_symmetric(cov)

    # Around 100 times faster than checking eigenvalues with np.linalg.eigh
    if is_cholesky_dec(cov) and is_positive_definite(cov):
        return cov

    if warn:
        warnings.warn(
            "The covariance matrix is not positive definite. "
            f"The {'Higham' if higham else 'Clipping'} algorithm will be used to "
            "find the nearest positive definite covariance.",
            stacklevel=2,
        )
    corr, std = cov_to_corr(cov)
    if higham:
        eps = np.finfo(np.float64).eps * 5
        diff = np.zeros(corr.shape)
        x = corr.copy()
        for _ in range(higham_max_iteration):
            x_adj = x - diff
            eig_vals, eig_vecs = np.linalg.eigh(x_adj)
            x = eig_vecs * np.maximum(eig_vals, eps) @ eig_vecs.T
            diff = x - x_adj
            np.fill_diagonal(x, 1)
            cov = corr_to_cov(x, std)
            if is_cholesky_dec(cov) and is_positive_definite(cov):
                break
        else:
            raise ValueError("Unable to find the nearest positive definite matrix")
    else:
        eig_vals, eig_vecs = np.linalg.eigh(corr)
        # Clipping the eigenvalues with a value smaller than 1e-13 can cause scipy
        # to consider the matrix non-PSD in some corner cases
        # (see test/test_stats.py)
        x = eig_vecs * np.maximum(eig_vals, _CLIPPING_VALUE) @ eig_vecs.T
        x, _ = cov_to_corr(x)
        cov = corr_to_cov(x, std)
    return cov


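# Illustrative usage sketch, not part of the original module: it repairs a
# symmetric matrix that has a negative eigenvalue and is therefore not a valid
# covariance. The helper name `_example_cov_nearest` is hypothetical.
def _example_cov_nearest() -> None:
    cov = np.array(
        [
            [1.0, 0.9, 0.7],
            [0.9, 1.0, 0.3],
            [0.7, 0.3, 1.0],
        ]
    )
    assert not is_cholesky_dec(cov)
    nearest = cov_nearest(cov)
    # The variances (diagonal) are preserved by construction.
    assert np.allclose(np.diag(nearest), np.diag(cov))

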
def commutation_matrix(x: np.ndarray) -> csr_matrix:
    """Compute the commutation matrix.

    Parameters
    ----------
    x : ndarray of shape (m, n)
        The matrix.

    Returns
    -------
    K : sparse matrix of shape (m * n, m * n)
        The commutation matrix in CSR format.
    """
    (m, n) = x.shape
    row = np.arange(m * n)
    col = row.reshape((m, n), order="F").ravel()
    data = np.ones(m * n, dtype=np.int8)
    k = csr_matrix((data, (row, col)), shape=(m * n, m * n))
    return k


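# Illustrative usage sketch, not part of the original module: the commutation
# matrix K maps the column-major vectorization vec(A) to vec(A.T). The helper name
# `_example_commutation_matrix` is hypothetical.
def _example_commutation_matrix() -> None:
    rng = np.random.default_rng(0)
    a = rng.standard_normal((3, 2))
    k = commutation_matrix(a)
    vec_a = a.ravel(order="F")  # column-major vectorization
    assert np.allclose(k @ vec_a, a.T.ravel(order="F"))

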
def compute_optimal_n_clusters(
    distance: np.ndarray, linkage_matrix: np.ndarray
) -> int:
    r"""Compute the optimal number of clusters based on the Two-Order Difference to
    Gap Statistic [1]_.

    The Two-Order Difference to Gap Statistic was developed to improve the
    performance and stability of Tibshirani's Gap statistic. It applies the
    two-order difference of the within-cluster dispersion to replace the reference
    null distribution in the Gap statistic.

    The number of clusters :math:`k` is determined by:

    .. math::
        \begin{cases}
        \begin{aligned}
        &\max_{k} & & W_{k+2} + W_{k} - 2 W_{k+1} \\
        &\text{s.t.} & & 1 \le k \le \max\bigl(8, \sqrt{n}\bigr) \\
        \end{aligned}
        \end{cases}

    with :math:`n` the sample size and :math:`W_{k}` the within-cluster dispersions
    defined as:

    .. math:: W_{k} = \sum_{i=1}^{k} \frac{D_{i}}{2|C_{i}|}

    where :math:`|C_{i}|` is the cardinality of cluster :math:`i` and :math:`D_{i}`
    its density defined as:

    .. math:: D_{i} = \sum_{u \in C_{i}} \sum_{v \in C_{i}} d(u,v)

    with :math:`d(u,v)` the distance between :math:`u` and :math:`v`.

    Parameters
    ----------
    distance : ndarray of shape (n, n)
        Distance matrix.

    linkage_matrix : ndarray of shape (n - 1, 4)
        Linkage matrix.

    Returns
    -------
    value : int
        Optimal number of clusters.

    References
    ----------
    .. [1] "Application of two-order difference to gap statistic".
        Yue, Wang & Wei (2009).
    """
    cut_tree = sch.cut_tree(linkage_matrix)
    n = cut_tree.shape[1]
    max_clusters = min(n, max(8, round(np.sqrt(n))))
    dispersion = []
    for k in range(max_clusters):
        level = cut_tree[:, n - k - 1]
        cluster_density = []
        for i in range(np.max(level) + 1):
            cluster_idx = np.argwhere(level == i).flatten()
            cluster_dists = scd.squareform(
                distance[cluster_idx, :][:, cluster_idx], checks=False
            )
            if cluster_dists.shape[0] != 0:
                cluster_density.append(np.nan_to_num(cluster_dists.mean()))
        dispersion.append(np.sum(cluster_density))
    dispersion = np.array(dispersion)
    gaps = np.roll(dispersion, -2) + dispersion - 2 * np.roll(dispersion, -1)
    gaps = gaps[:-2]
    # k=0 represents one cluster
    k = np.argmax(gaps) + 2
    return k


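# Illustrative usage sketch, not part of the original module: it estimates the
# number of clusters from a distance matrix and its hierarchical-clustering linkage
# built with scipy. The helper name `_example_optimal_n_clusters` is hypothetical.
def _example_optimal_n_clusters() -> None:
    rng = np.random.default_rng(0)
    # Two well-separated groups of points.
    points = np.vstack([rng.normal(0, 1, (10, 2)), rng.normal(10, 1, (10, 2))])
    condensed = scd.pdist(points)
    distance = scd.squareform(condensed)
    linkage_matrix = sch.linkage(condensed, method="ward")
    print(
        compute_optimal_n_clusters(distance=distance, linkage_matrix=linkage_matrix)
    )

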
def minimize_relative_weight_deviation(
    weights: np.ndarray,
    min_weights: np.ndarray,
    max_weights: np.ndarray,
    solver: str = "CLARABEL",
    solver_params: dict | None = None,
) -> np.ndarray:
    r"""Apply weight constraints to an initial array of weights by minimizing the
    relative weight deviation of the final weights from the initial weights.

    .. math::
        \begin{cases}
        \begin{aligned}
        &\min_{w} & & \Vert \frac{w - w_{init}}{w_{init}} \Vert_{2}^{2} \\
        &\text{s.t.} & & \sum_{i=1}^{N} w_{i} = 1 \\
        & & & w_{min} \leq w_i \leq w_{max}, \quad \forall i
        \end{aligned}
        \end{cases}

    Parameters
    ----------
    weights : ndarray of shape (n_assets,)
        Initial weights.

    min_weights : ndarray of shape (n_assets,)
        Minimum assets weights (weights lower bounds).

    max_weights : ndarray of shape (n_assets,)
        Maximum assets weights (weights upper bounds).

    solver : str, default="CLARABEL"
        The solver to use. The default is "CLARABEL", which is written in Rust and
        has better numerical stability and performance than ECOS and SCS.
        For more details about available solvers, check the CVXPY documentation:
        https://www.cvxpy.org/tutorial/advanced/index.html#choosing-a-solver

    solver_params : dict, optional
        Solver parameters. For example, `solver_params=dict(verbose=True)`.
        The default (`None`) is to use the CVXPY default.
        For more details about solver arguments, check the CVXPY documentation:
        https://www.cvxpy.org/tutorial/advanced/index.html#setting-solver-options

    Returns
    -------
    weights : ndarray of shape (n_assets,)
        The new constrained weights.
    """
    if not (weights.shape == min_weights.shape == max_weights.shape):
        raise ValueError(
            "`weights`, `min_weights` and `max_weights` must have the same size"
        )
    if np.any(weights <= 0):
        raise ValueError("Initial weights must be strictly positive")
    if not np.isclose(np.sum(weights), 1.0):
        raise ValueError("Initial weights must sum to one")
    if np.any(max_weights < min_weights):
        raise ValueError("`min_weights` must be lower or equal to `max_weights`")
    if np.all((weights >= min_weights) & (weights <= max_weights)):
        return weights

    if solver_params is None:
        solver_params = {}

    n = len(weights)
    w = cp.Variable(n)
    objective = cp.Minimize(cp.norm(w / weights - 1))
    constraints = [cp.sum(w) == 1, w >= min_weights, w <= max_weights]
    problem = cp.Problem(objective, constraints)
    try:
        problem.solve(solver=solver, **solver_params)
        if w.value is None:
            raise cp.SolverError("No solution found")
    except (cp.SolverError, scl.ArpackNoConvergence):
        raise cp.SolverError(
            f"Solver '{solver}' failed. Try another solver, or solve with "
            "solver_params=dict(verbose=True) for more information"
        ) from None
    return w.value


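# Illustrative usage sketch, not part of the original module: it caps every weight
# at 50% and redistributes the excess while staying close, in relative terms, to
# the initial weights. The helper name
# `_example_minimize_relative_weight_deviation` is hypothetical and requires the
# CLARABEL solver shipped with CVXPY.
def _example_minimize_relative_weight_deviation() -> None:
    weights = np.array([0.7, 0.2, 0.1])
    min_weights = np.zeros(3)
    max_weights = np.full(3, 0.5)
    new_weights = minimize_relative_weight_deviation(weights, min_weights, max_weights)
    assert np.isclose(new_weights.sum(), 1.0)
    assert np.all(new_weights <= max_weights + 1e-8)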