# Source code for skfolio.optimization.cluster.hierarchical._base

"""Base Hierarchical Clustering Optimization estimator."""

# Copyright (c) 2023
# Author: Hugo Delatte <delatte.hugo@gmail.com>
# SPDX-License-Identifier: BSD-3-Clause
# Implementation derived from:
# Riskfolio-Lib, Copyright (c) 2020-2023, Dany Cajas, Licensed under BSD 3 clause.
# scikit-learn, Copyright (c) 2007-2010 David Cournapeau, Fabian Pedregosa, Olivier

from abc import ABC, abstractmethod
from typing import Any

import numpy as np
import numpy.typing as npt
import sklearn.utils.metadata_routing as skm

import skfolio.typing as skt
from skfolio.cluster import HierarchicalClustering
from skfolio.distance import BaseDistance
from skfolio.measures import ExtraRiskMeasure, RiskMeasure
from skfolio.optimization._base import BaseOptimization
from skfolio.portfolio import Portfolio
from skfolio.prior import BasePrior, ReturnDistribution
from skfolio.utils.tools import input_to_array



# [docs] (documentation-link residue from the HTML export; not part of the module)
class BaseHierarchicalOptimization(BaseOptimization, ABC):
    r"""Base Hierarchical Clustering Optimization estimator.

    Parameters
    ----------
    risk_measure : RiskMeasure or ExtraRiskMeasure, default=RiskMeasure.VARIANCE
        :class:`~skfolio.measures.RiskMeasure` or :class:`~skfolio.measures.ExtraRiskMeasure`
        of the optimization.
        Can be any of:

            * MEAN_ABSOLUTE_DEVIATION
            * FIRST_LOWER_PARTIAL_MOMENT
            * VARIANCE
            * STANDARD_DEVIATION
            * SEMI_VARIANCE
            * CVAR
            * EVAR
            * WORST_REALIZATION
            * CDAR
            * MAX_DRAWDOWN
            * AVERAGE_DRAWDOWN
            * EDAR
            * ULCER_INDEX
            * GINI_MEAN_DIFFERENCE_RATIO
            * VALUE_AT_RISK
            * DRAWDOWN_AT_RISK
            * ENTROPIC_RISK_MEASURE
            * FOURTH_CENTRAL_MOMENT
            * FOURTH_LOWER_PARTIAL_MOMENT

        The default is `RiskMeasure.VARIANCE`.

    prior_estimator : BasePrior, optional
        :ref:`Prior estimator <prior>`.
        The prior estimator is used to estimate the :class:`~skfolio.prior.ReturnDistribution`
        containing the estimation of assets expected returns, covariance matrix and
        returns. The moments and returns estimations are used for the risk computation
        and the returns estimation is used by the distance matrix estimator.
        The default (`None`) is to use :class:`~skfolio.prior.EmpiricalPrior`.

    distance_estimator : BaseDistance, optional
        :ref:`Distance estimator <distance>`.
        The distance estimator is used to estimate the codependence and the distance
        matrix needed for the computation of the linkage matrix.
        The default (`None`) is to use :class:`~skfolio.distance.PearsonDistance`.

    hierarchical_clustering_estimator : HierarchicalClustering, optional
        :ref:`Hierarchical Clustering estimator <hierarchical_clustering>`.
        The hierarchical clustering estimator is used to compute the linkage matrix
        and the hierarchical clustering of the assets based on the distance matrix.
        The default (`None`) is to use
        :class:`~skfolio.cluster.HierarchicalClustering`.

    min_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
        Minimum assets weights (weights lower bounds). Negative weights are not allowed.
        If a float is provided, it is applied to each asset.
        If a dictionary is provided, its (key/value) pair must be the
        (asset name/asset minimum weight) and the input `X` of the `fit` method must be
        a DataFrame with the assets names in columns.
        When using a dictionary, assets values that are not provided are assigned a
        minimum weight of `0.0`. The default is 0.0 (no short selling).

        Example:

           * min_weights = 0 --> long only portfolio (no short selling).
           * min_weights = None --> same as `min_weights = 0` (negative weights are not allowed).
           * min_weights = {"SX5E": 0, "SPX": 0.1}
           * min_weights = [0, 0.1]

    max_weights : float | dict[str, float] | array-like of shape (n_assets, ), default=1.0
        Maximum assets weights (weights upper bounds). Weights above 1.0 are not
        allowed. If a float is provided, it is applied to each asset.
        If a dictionary is provided, its (key/value) pair must be the
        (asset name/asset maximum weight) and the input `X` of the `fit` method must be
        a DataFrame with the assets names in columns.
        When using a dictionary, assets values that are not provided are assigned a
        maximum weight of `1.0`. The default is 1.0 (each asset is below 100%).

        Example:

           * max_weights = None --> same as `max_weights = 1.0`.
           * max_weights = 0.5 --> each weight must be below 50%.
           * max_weights = {"SX5E": 1, "SPX": 0.25}
           * max_weights = [1, 0.25]

    transaction_costs : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
        Transaction costs of the assets. It is used to add linear transaction costs to
        the optimization problem:

        .. math:: total\_cost = \sum_{i=1}^{N} c_{i} \times |w_{i} - w\_prev_{i}|

        with :math:`c_{i}` the transaction cost of asset i, :math:`w_{i}` its weight
        and :math:`w\_prev_{i}` its previous weight (defined in `previous_weights`).
        The float :math:`total\_cost` is impacting the portfolio expected return in the optimization:

        .. math:: expected\_return = \mu^{T} \cdot w - total\_cost

        with :math:`\mu` the vector of assets' expected returns and :math:`w` the
        vector of assets weights.

        If a float is provided, it is applied to each asset.
        If a dictionary is provided, its (key/value) pair must be the
        (asset name/asset cost) and the input `X` of the `fit` method must be a
        DataFrame with the assets names in columns.
        The default value is `0.0`.

        .. warning::

            Based on the above formula, the periodicity of the transaction costs
            needs to be homogenous to the periodicity of :math:`\mu`. For example, if
            the input `X` is composed of **daily** returns, the `transaction_costs` need
            to be expressed as **daily** costs.
            (See :ref:`sphx_glr_auto_examples_mean_risk_plot_6_transaction_costs.py`)

    management_fees : float | dict[str, float] | array-like of shape (n_assets, ), default=0.0
        Management fees of the assets. It is used to add linear management fees to the
        optimization problem:

        .. math:: total\_fee = \sum_{i=1}^{N} f_{i} \times w_{i}

        with :math:`f_{i}` the management fee of asset i and :math:`w_{i}` its weight.
        The float :math:`total\_fee` is impacting the portfolio expected return in the optimization:

        .. math:: expected\_return = \mu^{T} \cdot w - total\_fee

        with :math:`\mu` the vector of assets expected returns and :math:`w` the vector
        of assets weights.

        If a float is provided, it is applied to each asset.
        If a dictionary is provided, its (key/value) pair must be the
        (asset name/asset fee) and the input `X` of the `fit` method must be a
        DataFrame with the assets names in columns.
        The default value is `0.0`.

        .. warning::

            Based on the above formula, the periodicity of the management fees needs to
            be homogenous to the periodicity of :math:`\mu`. For example, if the input
            `X` is composed of **daily** returns, the `management_fees` need to be
            expressed in **daily** fees.

        .. note::

            Another approach is to directly impact the management fees to the input `X`
            in order to express the returns net of fees. However, when estimating the
            :math:`\mu` parameter using for example Shrinkage estimators, this approach
            would mix a deterministic value with an uncertain one leading to unwanted
            bias in the management fees.

    previous_weights : float | dict[str, float] | array-like of shape (n_assets, ), optional
        Previous weights of the assets. Previous weights are used to compute the
        portfolio total cost. If a float is provided, it is applied to each asset.
        If a dictionary is provided, its (key/value) pair must be the
        (asset name/asset previous weight) and the input `X` of the `fit` method must
        be a DataFrame with the assets names in columns.
        The default (`None`) means no previous weights.

    portfolio_params :  dict, optional
        Portfolio parameters passed to the portfolio evaluated by the `predict` and
        `score` methods. If not provided, the `name`, `transaction_costs`,
        `management_fees`, `previous_weights` and `risk_free_rate` are copied from the
        optimization model and passed to the portfolio.

    Attributes
    ----------
    weights_ : ndarray of shape (n_assets,)
        Weights of the assets.

    prior_estimator_ : BasePrior
        Fitted `prior_estimator`.

    distance_estimator_ : BaseDistance
        Fitted `distance_estimator`.

    hierarchical_clustering_estimator_ : HierarchicalClustering
        Fitted `hierarchical_clustering_estimator`.
    """

    # Type declarations of the fitted sub-estimators. These are bare annotations:
    # no value is assigned in this base class (presumably the concrete `fit`
    # implementations set them -- confirm in subclasses).
    prior_estimator_: BasePrior
    distance_estimator_: BaseDistance
    hierarchical_clustering_estimator_: HierarchicalClustering

    @abstractmethod
    def __init__(
        self,
        risk_measure: RiskMeasure | ExtraRiskMeasure = RiskMeasure.VARIANCE,
        prior_estimator: BasePrior | None = None,
        distance_estimator: BaseDistance | None = None,
        hierarchical_clustering_estimator: HierarchicalClustering | None = None,
        min_weights: skt.MultiInput | None = 0.0,
        max_weights: skt.MultiInput | None = 1.0,
        transaction_costs: skt.MultiInput = 0.0,
        management_fees: skt.MultiInput = 0.0,
        previous_weights: skt.MultiInput | None = None,
        portfolio_params: dict | None = None,
    ):
        """Store the hyper-parameters as given; nothing is validated or fitted here.

        The constructor is abstract, so this base class cannot be instantiated
        directly. `portfolio_params` is forwarded to the parent constructor, every
        other argument is stored on the instance under its own name.
        """
        super().__init__(portfolio_params=portfolio_params)

        # Sub-estimators (`None` is stored as-is, no default is built here).
        self.prior_estimator = prior_estimator
        self.distance_estimator = distance_estimator
        self.hierarchical_clustering_estimator = hierarchical_clustering_estimator

        # Risk measure and weight bounds.
        self.risk_measure = risk_measure
        self.min_weights = min_weights
        self.max_weights = max_weights

        # Costs, fees and the weights they are measured against.
        self.transaction_costs = transaction_costs
        self.management_fees = management_fees
        self.previous_weights = previous_weights

        # Private flag, not a constructor parameter; it always starts as False.
        # NOTE(review): presumably flipped by code that seriates the dendrogram --
        # confirm against the subclasses.
        self._seriated = False

    def _clean_input(
        self,
        value: float | dict | np.ndarray | list,
        n_assets: int,
        fill_value: Any,
        name: str,
    ) -> np.ndarray:
        """Convert input to cleaned 1D array
         value : float, dict, array-like or None.
            Input value to clean and convert.

        Parameters
        ----------
        value : float, dict or array-like.
            Input value to clean.

        n_assets : int
            Number of assets. Used to verify the shape of the converted array.

        fill_value : Any
            When `items` is a dictionary, elements that are not in `asset_names` are
            filled with `fill_value` in the converted array.

        name : str
            Name used for error messages.

        Returns
        -------
        value :  ndarray of shape (n_assets,)
            The cleaned float or 1D array.
        """
        if value is None:
            raise ValueError("Cannot convert None to array")
        if np.isscalar(value):
            return value * np.ones(n_assets)
        return input_to_array(
            items=value,
            n_assets=n_assets,
            fill_value=fill_value,
            dim=1,
            assets_names=(
                self.feature_names_in_ if hasattr(self, "feature_names_in_") else None
            ),
            name=name,
        )

    def _risk(
        self,
        weights: np.ndarray,
        return_distribution: ReturnDistribution,
    ) -> float:
        """Evaluate `self.risk_measure` for the portfolio defined by `weights`.

        Variance and standard deviation are derived from the covariance matrix of
        the return distribution. Every other measure is read from the `Portfolio`
        attribute named after the value of the risk measure.

        Parameters
        ----------
        weights : ndarray of shape (n_assets,)
            The vector of weights.

        return_distribution : ReturnDistribution
            The assets return distribution. Its `returns`, `sample_weight` and
            `covariance` attributes are read.

        Returns
        -------
        risk : float
            The risk measure of the theoretical portfolio defined by the weights
            vector.
        """
        portfolio = Portfolio(
            X=return_distribution.returns,
            sample_weight=return_distribution.sample_weight,
            weights=weights,
            transaction_costs=self.transaction_costs,
            management_fees=self.management_fees,
            previous_weights=self.previous_weights,
        )
        measure = self.risk_measure
        covariance_based = (RiskMeasure.VARIANCE, RiskMeasure.STANDARD_DEVIATION)

        if measure not in covariance_based:
            # Generic case: the portfolio exposes the measure as an attribute.
            return getattr(portfolio, str(measure.value))

        variance = portfolio.variance_from_assets(
            assets_covariance=return_distribution.covariance
        )
        if measure == RiskMeasure.STANDARD_DEVIATION:
            return np.sqrt(variance)
        return variance

    def _unitary_risks(self, return_distribution: ReturnDistribution) -> np.ndarray:
        """Compute the vector of risk measure for each single assets.

        Parameters
        ----------
        return_distribution : ReturnDistribution
            The asset returns distribution.

        Returns
        -------
        values: ndarray of shape (n_assets,)
            The risk measure of each asset.
        """
        n_assets = return_distribution.returns.shape[1]
        risks = [
            self._risk(weights=weights, return_distribution=return_distribution)
            for weights in np.identity(n_assets)
        ]
        return np.array(risks)

    def _convert_weights_bounds(self, n_assets: int) -> tuple[np.ndarray, np.ndarray]:
        """Convert the input weights lower and upper bounds to two 1D arrays.

        Parameters
        ----------
        n_assets : int
            Number of assets.

        Returns
        -------
        min_weights : ndarray of shape (n_assets,)
            The weight lower bound 1D array.
        max_weights : ndarray of shape (n_assets,)
            The weight upper bound 1D array.
        """
        if self.min_weights is None:
            min_weights = np.zeros(n_assets)
        else:
            min_weights = self._clean_input(
                self.min_weights,
                n_assets=n_assets,
                fill_value=0,
                name="min_weights",
            )
            if np.any(min_weights < 0):
                raise ValueError("`min_weights` must be strictly positive")

        if self.max_weights is None:
            max_weights = np.ones(n_assets)
        else:
            max_weights = self._clean_input(
                self.max_weights,
                n_assets=n_assets,
                fill_value=1,
                name="max_weights",
            )
            if np.any(max_weights > 1):
                raise ValueError("`max_weights` must be less than or equal to 1.0")
            if np.sum(max_weights) < 1:
                raise ValueError(
                    "The sum of `max_weights` must be greater than or equal to 1.0"
                )

        if np.any(min_weights > max_weights):
            raise NameError(
                "Items of `min_weights` must be less than or equal to items of"
                " `max_weights`"
            )

        return min_weights, max_weights


    # [docs] (documentation-link residue from the HTML export; not part of the module)
    def get_metadata_routing(self):
        """Build the scikit-learn metadata router of this estimator.

        The router is owned by the name of the instance's class and registers the
        three sub-estimators (`prior_estimator`, `distance_estimator` and
        `hierarchical_clustering_estimator`), each with its own mapping from the
        caller's `fit` to the sub-estimator's `fit`.

        Returns
        -------
        router : MetadataRouter
            The populated router.
        """
        # noinspection PyTypeChecker
        router = skm.MetadataRouter(owner=self.__class__.__name__)
        routed_params = (
            "prior_estimator",
            "distance_estimator",
            "hierarchical_clustering_estimator",
        )
        for param_name in routed_params:
            # A fresh `MethodMapping` per sub-estimator, registered under the name
            # of the constructor parameter that holds it.
            router.add(
                method_mapping=skm.MethodMapping().add(caller="fit", callee="fit"),
                **{param_name: getattr(self, param_name)},
            )
        return router


    @abstractmethod
    def fit(self, X: npt.ArrayLike, y: None = None, **fit_params):
        pass