Source code for skfolio.moments.covariance._base

"""Base Covariance Estimators."""

# Copyright (c) 2023
# Author: Hugo Delatte <delatte.hugo@gmail.com>
# License: BSD 3 clause
# Implementation derived from:
# scikit-learn, Copyright (c) 2007-2010 David Cournapeau, Fabian Pedregosa, Olivier
# Grisel, licensed under BSD 3 clause.

from abc import ABC, abstractmethod

import numpy as np
import numpy.typing as npt
import sklearn.base as skb

from skfolio.exceptions import NonPositiveVarianceError
from skfolio.utils.stats import cov_nearest


class BaseCovariance(skb.BaseEstimator, ABC):
    """Base class for all covariance estimators in `skfolio`.

    Parameters
    ----------
    nearest : bool, default=True
        If this is set to True, the covariance is replaced by the nearest covariance
        matrix that is positive definite and with a Cholesky decomposition that can
        be computed. The variance is left unchanged.
        A covariance matrix that is not positive definite often occurs in high
        dimensional problems. It can be due to multicollinearity, floating-point
        inaccuracies, or when the number of observations is smaller than the number
        of assets. For more details, see :func:`~skfolio.utils.stats.cov_nearest`.
        The default is `True`.

    higham : bool, default=False
        If this is set to True, the Higham (2002) algorithm is used to find the
        nearest positive definite covariance, otherwise the eigenvalues are clipped
        to a threshold above zero (1e-13). The default is `False`, which uses the
        clipping method, as the Higham algorithm can be slow for large datasets.

    higham_max_iteration : int, default=100
        Maximum number of iterations of the Higham (2002) algorithm.
        The default value is `100`.

    Attributes
    ----------
    covariance_ : ndarray of shape (n_assets, n_assets)
        Estimated covariance matrix.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    covariance_: np.ndarray

    @abstractmethod
    def __init__(
        self,
        nearest: bool = True,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        self.nearest = nearest
        self.higham = higham
        self.higham_max_iteration = higham_max_iteration

    @abstractmethod
    def fit(self, X: npt.ArrayLike, y=None):
        pass

    def _sanity_check(self, covariance: np.ndarray) -> None:
        """Perform a sanity check on the covariance matrix by verifying that all
        diagonal elements are strictly positive.
        The goal is to detect corrupted asset data (with zero variance) early, as it
        would otherwise lead to optimization errors.
        """
        cond = np.diag(covariance) < 1e-15
        if np.any(cond):
            corrupted_assets = list(np.argwhere(cond).flatten())
            detail = "assets indices"
            if hasattr(self, "feature_names_in_"):
                corrupted_assets = list(self.feature_names_in_[corrupted_assets])
                detail = "assets"
            raise NonPositiveVarianceError(
                f"The following {detail} have a non-positive variance:"
                f" {corrupted_assets}"
            )

    def _set_covariance(self, covariance: np.ndarray) -> None:
        """Perform checks, convert to the nearest PSD matrix if specified, and save
        the covariance.

        Parameters
        ----------
        covariance : array-like of shape (n_assets, n_assets)
            Estimated covariance matrix to be stored.
        """
        self._sanity_check(covariance)
        if self.nearest:
            covariance = cov_nearest(
                covariance,
                higham=self.higham,
                higham_max_iteration=self.higham_max_iteration,
                warn=True,
            )
        # Store the (possibly corrected) covariance matrix.
        self.covariance_ = covariance
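

# --- Illustrative example (not part of the library source) -------------------
# A minimal sketch of how a concrete estimator could subclass `BaseCovariance`:
# it implements `fit` with a plain sample covariance and delegates the sanity
# check and the optional nearest-PSD projection to `_set_covariance`. The class
# name `SampleCovariance` is hypothetical and only demonstrates the expected use
# of the base class API.


class SampleCovariance(BaseCovariance):
    """Hypothetical sample covariance estimator built on `BaseCovariance`."""

    def __init__(
        self,
        nearest: bool = True,
        higham: bool = False,
        higham_max_iteration: int = 100,
    ):
        super().__init__(
            nearest=nearest,
            higham=higham,
            higham_max_iteration=higham_max_iteration,
        )

    def fit(self, X: npt.ArrayLike, y=None) -> "SampleCovariance":
        """Estimate the sample covariance of the asset returns in `X`."""
        X = np.asarray(X)
        # rowvar=False: rows are observations, columns are assets.
        covariance = np.cov(X, rowvar=False)
        # Checks that all variances are strictly positive and, if `nearest=True`,
        # replaces the matrix by its nearest positive definite counterpart before
        # storing it in `covariance_`.
        self._set_covariance(covariance)
        return self


# Usage sketch, assuming `returns` is an (n_observations, n_assets) array:
#   model = SampleCovariance().fit(returns)
#   model.covariance_  # (n_assets, n_assets) estimated covariance matrix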