"""Distance Estimators"""
# Copyright (c) 2023
# Author: Hugo Delatte <delatte.hugo@gmail.com>
# License: BSD 3 clause
import numpy as np
import numpy.typing as npt
import pandas as pd
import scipy.spatial.distance as scd
import scipy.stats as sct
import sklearn.metrics as skmc
import sklearn.utils.metadata_routing as skm
from skfolio.distance._base import BaseDistance
from skfolio.moments import BaseCovariance, GerberCovariance
from skfolio.utils.stats import (
NBinsMethod,
cov_to_corr,
n_bins_freedman,
n_bins_knuth,
)
from skfolio.utils.tools import check_estimator
[docs]
class PearsonDistance(BaseDistance):
    r"""Distance estimator built on the Pearson correlation.

    The Pearson correlation matrix of the assets is optionally passed through an
    absolute-value and/or a power transformation to obtain the codependence
    matrix, from which the distance matrix is then derived.

    Some widely used distances are:

    * Standard angular distance = :math:`\sqrt{0.5 \times (1 - corr)}`
    * Absolute angular distance = :math:`\sqrt{1 - |corr|}`
    * Squared angular distance = :math:`\sqrt{1 - corr^2}`

    Parameters
    ----------
    absolute : bool, default=False
        If True, the absolute transformation is applied to the correlation
        matrix.

    power : float, default=1
        Exponent of the power transformation applied to the correlation matrix.

    Attributes
    ----------
    codependence_ : ndarray of shape (n_assets, n_assets)
        Codependence matrix.

    distance_ : ndarray of shape (n_assets, n_assets)
        Distance matrix.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Building Diversified Portfolios that Outperform Out-of-Sample",
        Lòpez de Prado, Journal of Portfolio Management (2016)
    """

    def __init__(self, absolute: bool = False, power: float = 1):
        self.absolute = absolute
        self.power = power

    def fit(self, X: npt.ArrayLike, y=None) -> "PearsonDistance":
        """Fit the Pearson Distance estimator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : PearsonDistance
            Fitted estimator.
        """
        returns = self._validate_data(X)
        # np.corrcoef treats rows as variables, hence the transpose so that each
        # asset (column of the returns) becomes one variable.
        pearson_corr = np.corrcoef(returns.T)
        codependence, distance = _corr_to_distance(
            pearson_corr, absolute=self.absolute, power=self.power
        )
        self.codependence_ = codependence
        self.distance_ = distance
        return self
[docs]
class KendallDistance(BaseDistance):
    r"""Distance estimator built on the Kendall rank correlation.

    The Kendall correlation matrix of the assets is optionally passed through an
    absolute-value and/or a power transformation to obtain the codependence
    matrix, from which the distance matrix is then derived.

    Some widely used distances are:

    * Standard angular distance = :math:`\sqrt{0.5 \times (1 - corr)}`
    * Absolute angular distance = :math:`\sqrt{1 - |corr|}`
    * Squared angular distance = :math:`\sqrt{1 - corr^2}`

    Parameters
    ----------
    absolute : bool, default=False
        If True, the absolute transformation is applied to the correlation
        matrix.

    power : float, default=1
        Exponent of the power transformation applied to the correlation matrix.

    Attributes
    ----------
    codependence_ : ndarray of shape (n_assets, n_assets)
        Codependence matrix.

    distance_ : ndarray of shape (n_assets, n_assets)
        Distance matrix.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Building Diversified Portfolios that Outperform Out-of-Sample",
        Lòpez de Prado, Journal of Portfolio Management (2016)
    """

    def __init__(self, absolute: bool = False, power: float = 1):
        self.absolute = absolute
        self.power = power

    def fit(self, X: npt.ArrayLike, y=None) -> "KendallDistance":
        """Fit the Kendall Distance estimator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : KendallDistance
            Fitted estimator.
        """
        returns = self._validate_data(X)
        # pandas computes the pairwise Kendall correlation between columns.
        returns_frame = pd.DataFrame(returns)
        kendall_corr = returns_frame.corr(method="kendall").to_numpy()
        codependence, distance = _corr_to_distance(
            kendall_corr, absolute=self.absolute, power=self.power
        )
        self.codependence_ = codependence
        self.distance_ = distance
        return self
[docs]
class SpearmanDistance(BaseDistance):
    r"""Distance estimator built on the Spearman rank correlation.

    The Spearman correlation matrix of the assets is optionally passed through
    an absolute-value and/or a power transformation to obtain the codependence
    matrix, from which the distance matrix is then derived.

    Some widely used distances are:

    * Standard angular distance = :math:`\sqrt{0.5 \times (1 - corr)}`
    * Absolute angular distance = :math:`\sqrt{1 - |corr|}`
    * Squared angular distance = :math:`\sqrt{1 - corr^2}`

    Parameters
    ----------
    absolute : bool, default=False
        If True, the absolute transformation is applied to the correlation
        matrix.

    power : float, default=1
        Exponent of the power transformation applied to the correlation matrix.

    Attributes
    ----------
    codependence_ : ndarray of shape (n_assets, n_assets)
        Codependence matrix.

    distance_ : ndarray of shape (n_assets, n_assets)
        Distance matrix.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Building Diversified Portfolios that Outperform Out-of-Sample",
        Lòpez de Prado, Journal of Portfolio Management (2016)
    """

    def __init__(self, absolute: bool = False, power: float = 1):
        self.absolute = absolute
        self.power = power

    def fit(self, X: npt.ArrayLike, y=None) -> "SpearmanDistance":
        """Fit the Spearman Distance estimator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : SpearmanDistance
            Fitted estimator.
        """
        returns = self._validate_data(X)
        # pandas computes the pairwise Spearman correlation between columns.
        returns_frame = pd.DataFrame(returns)
        spearman_corr = returns_frame.corr(method="spearman").to_numpy()
        codependence, distance = _corr_to_distance(
            spearman_corr, absolute=self.absolute, power=self.power
        )
        self.codependence_ = codependence
        self.distance_ = distance
        return self
[docs]
class CovarianceDistance(BaseDistance):
    r"""Distance estimator built on a covariance estimator.

    The correlation matrix implied by a chosen
    :ref:`covariance estimator <covariance_estimator>` is optionally passed
    through an absolute-value and/or a power transformation to obtain the
    codependence matrix, from which the distance matrix is then derived.

    Some widely used distances are:

    * Standard angular distance = :math:`\sqrt{0.5 \times (1 - corr)}`
    * Absolute angular distance = :math:`\sqrt{1 - |corr|}`
    * Squared angular distance = :math:`\sqrt{1 - corr^2}`

    Parameters
    ----------
    covariance_estimator : BaseCovariance, optional
        :ref:`Covariance estimator <covariance_estimator>`.
        The default (`None`) is to use :class:`~skfolio.moments.GerberCovariance`.

    absolute : bool, default=False
        If True, the absolute transformation is applied to the correlation
        matrix.

    power : float, default=1
        Exponent of the power transformation applied to the correlation matrix.

    Attributes
    ----------
    codependence_ : ndarray of shape (n_assets, n_assets)
        Codependence matrix.

    distance_ : ndarray of shape (n_assets, n_assets)
        Distance matrix.

    covariance_estimator_ : BaseCovariance
        Fitted `covariance_estimator`.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Building Diversified Portfolios that Outperform Out-of-Sample",
        Lòpez de Prado, Journal of Portfolio Management (2016)
    """

    covariance_estimator_: BaseCovariance

    def __init__(
        self,
        covariance_estimator: BaseCovariance | None = None,
        absolute: bool = False,
        power: float = 1,
    ):
        self.covariance_estimator = covariance_estimator
        self.absolute = absolute
        self.power = power

    def fit(self, X: npt.ArrayLike, y=None, **fit_params) -> "CovarianceDistance":
        """Fit the Covariance Distance estimator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used by this estimator itself; it is forwarded as-is to the
            `fit` method of the covariance estimator.

        **fit_params : dict
            Parameters processed by the metadata routing; the ones routed to
            the covariance estimator are passed to its `fit` method.

        Returns
        -------
        self : CovarianceDistance
            Fitted estimator.
        """
        routed_params = skm.process_routing(self, "fit", **fit_params)
        estimator_fit_kwargs = routed_params.covariance_estimator.fit

        # Fit the sub-estimator first, on the raw input.
        self.covariance_estimator_ = check_estimator(
            self.covariance_estimator,
            default=GerberCovariance(),
            check_type=BaseCovariance,
        )
        self.covariance_estimator_.fit(X, y, **estimator_fit_kwargs)

        # Validation (and conversion to numpy) is done only after all models
        # have been fitted, to keep the feature names information. The
        # validated array itself is not needed afterwards.
        self._validate_data(X)

        fitted_covariance = self.covariance_estimator_.covariance_
        corr, _ = cov_to_corr(fitted_covariance)
        codependence, distance = _corr_to_distance(
            corr, absolute=self.absolute, power=self.power
        )
        self.codependence_ = codependence
        self.distance_ = distance
        return self
[docs]
class DistanceCorrelation(BaseDistance):
    """Distance Correlation estimator.

    Distance Correlation was introduced by Szekely [1]_ to capture non-linear
    dependencies.

    Parameters
    ----------
    threshold : float, default=0.5
        Distance correlation threshold. The value is stored on the estimator;
        it is not referenced by `fit` or `_dcorr` in this class.

    Attributes
    ----------
    codependence_ : ndarray of shape (n_assets, n_assets)
        Codependence matrix (pairwise distance correlations, with ones on the
        diagonal).

    distance_ : ndarray of shape (n_assets, n_assets)
        Distance matrix, computed as `sqrt(clip(1 - codependence_, 0, 1))`.

    n_features_in_ : int
        Number of assets seen during `fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of assets seen during `fit`. Defined only when `X`
        has assets names that are all strings.

    References
    ----------
    .. [1] "Measuring and testing independence by correlation of distances"
        Gábor J. Szekely , 2005
    """

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    @staticmethod
    def _dcorr(x: np.ndarray, y: np.ndarray) -> float:
        """Calculate the distance correlation between two variables.

        Parameters
        ----------
        x : ndarray of shape (n_observations,)
            Observations of the first variable.

        y : ndarray of shape (n_observations,)
            Observations of the second variable.

        Returns
        -------
        value : float
            Distance correlation of `x` and `y`. It is `0.0` when the
            normalization term is zero (e.g. one of the series is constant),
            instead of the NaN a division by zero would produce.
        """

        def _double_centered_distances(values: np.ndarray) -> np.ndarray:
            # Pairwise absolute differences between observations, then removal
            # of the row means and column means and re-addition of the grand
            # mean.
            dist = scd.squareform(scd.pdist(values.reshape(-1, 1)))
            return (
                dist
                - dist.mean(axis=0)[np.newaxis, :]
                - dist.mean(axis=1)[:, np.newaxis]
                + dist.mean()
            )

        a = _double_centered_distances(x)
        b = _double_centered_distances(y)

        denominator = np.sqrt(np.sqrt((a**2).sum()) * np.sqrt((b**2).sum()))
        if denominator == 0:
            # Zero distance variance: by convention the distance correlation
            # is 0. Without this guard the result is 0 / 0 = NaN, which then
            # contaminates the codependence and distance matrices.
            return 0.0

        # Floating-point rounding can push the sum marginally below zero;
        # clamp it so the square root does not yield NaN. np.maximum is used
        # (rather than builtin max) so that a genuine NaN still propagates.
        numerator = np.maximum((a * b).sum(), 0.0)
        return np.sqrt(numerator) / denominator

    def fit(self, X: npt.ArrayLike, y=None) -> "DistanceCorrelation":
        """Fit the Distance Correlation estimator.

        Parameters
        ----------
        X : array-like of shape (n_observations, n_assets)
            Price returns of the assets.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : DistanceCorrelation
            Fitted estimator.
        """
        X = self._validate_data(X)
        n_assets = X.shape[1]
        # The diagonal stays at 1; only the strict upper triangle is computed
        # and mirrored, since the measure is symmetric in its two arguments.
        corr = np.ones((n_assets, n_assets))
        # TODO: parallelize
        for i, j in zip(*np.triu_indices(n_assets, 1), strict=True):
            corr[i, j] = self._dcorr(x=X[:, i], y=X[:, j])
            corr[j, i] = corr[i, j]
        self.codependence_ = corr
        self.distance_ = np.sqrt(np.clip(1 - self.codependence_, a_min=0.0, a_max=1.0))
        return self
def _corr_to_distance(
    corr: np.ndarray, absolute: bool, power: float
) -> tuple[np.ndarray, np.ndarray]:
    r"""Convert a correlation matrix into codependence and distance matrices.

    The codependence is the correlation after the optional absolute-value
    transformation followed by the power transformation. The distance is
    :math:`\sqrt{s \times (1 - codependence)}` clipped to :math:`[0, 1]` before
    the square root, where :math:`s = 1 / (1 - m)` and :math:`m` is the
    smallest value obtained by applying the same transformation to the
    reference correlations :math:`\{-1, 0, 1\}`.

    Some widely used distances are:

    * Standard angular distance = :math:`\sqrt{0.5 \times (1 - corr)}`
    * Absolute angular distance = :math:`\sqrt{1 - |corr|}`
    * Squared angular distance = :math:`\sqrt{1 - corr^2}`

    Parameters
    ----------
    corr : ndarray of shape (n_assets, n_assets)
        Correlation matrix.

    absolute : bool
        If True, the absolute transformation is applied to the correlation
        matrix.

    power : float
        Exponent of the power transformation applied to the correlation matrix.

    Returns
    -------
    codependence, distance : tuple[np.ndarray, np.ndarray]
        Codependence and distance matrices.
    """
    # Reference correlations pushed through the same transformation as `corr`;
    # their minimum afterwards determines the scaling factor.
    reference = np.array([-1, 0, 1])
    codependence = corr

    if absolute:
        codependence = np.abs(codependence)
        reference = np.abs(reference)

    codependence = np.power(codependence, power)
    reference = np.power(reference, power)

    lowest = reference.min()
    scale = 1 / (1 - lowest)
    scaled_gap = scale * (1 - codependence)
    distance = np.sqrt(np.clip(scaled_gap, a_min=0.0, a_max=1.0))
    return codependence, distance