# Source code for skfolio.distribution.copula._selection

"""Bivariate Copula Selection."""

# Copyright (c) 2025
# Authors: The skfolio developers
# Credits: Matteo Manzi, Vincent Maladière, Carlo Nicolini
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import numpy.typing as npt
import scipy.stats as st
import sklearn as sk

from skfolio.distribution._base import SelectionCriterion
from skfolio.distribution.copula._base import BaseBivariateCopula
from skfolio.distribution.copula._clayton import ClaytonCopula
from skfolio.distribution.copula._gaussian import GaussianCopula
from skfolio.distribution.copula._gumbel import GumbelCopula
from skfolio.distribution.copula._independent import IndependentCopula
from skfolio.distribution.copula._joe import JoeCopula
from skfolio.distribution.copula._student_t import StudentTCopula



# [docs]
def select_bivariate_copula(
    X: npt.ArrayLike,
    copula_candidates: list[BaseBivariateCopula] | None = None,
    selection_criterion: SelectionCriterion = SelectionCriterion.AIC,
    independence_level: float = 0.05,
) -> BaseBivariateCopula:
    """
    Select the best bivariate copula from a list of candidates using an information
    criterion.

    This function first tests the dependence between the two variables in X using
    Kendall's tau independence test. If the p-value is greater than or equal to
    `independence_level`, the null hypothesis of independence is not rejected, and the
    `IndependentCopula` is returned. Otherwise, each candidate copula in
    `copula_candidates` is fitted to the data X. For each candidate, either the
    Akaike Information Criterion (AIC) or the Bayesian Information Criterion (BIC) is
    computed, and the copula with the lowest criterion value is selected.

    Parameters
    ----------
    X : array-like of shape (n_observations, 2)
        An array of bivariate inputs (u, v) with uniform marginals (values in [0, 1]).

    copula_candidates : list[BaseBivariateCopula]
        A list of candidate copula models. Each candidate must inherit from
        `BaseBivariateCopula`. If None, defaults to
        `[GaussianCopula(), StudentTCopula(), ClaytonCopula(), GumbelCopula(), JoeCopula()]`.

    selection_criterion : SelectionCriterion, default=SelectionCriterion.AIC
        The criterion used for model selection. Possible values are:
            - SelectionCriterion.AIC : Akaike Information Criterion
            - SelectionCriterion.BIC : Bayesian Information Criterion

    independence_level : float, default=0.05
        The significance level for the Kendall tau independence test. If the p-value is
        greater than or equal to this level, the independence hypothesis is not
        rejected, and the `IndependentCopula` is returned.

    Returns
    -------
    selected_copula : BaseBivariateCopula
        The fitted copula model among the candidates that minimizes the selected
        information criterion (AIC or BIC).

    Raises
    ------
    ValueError
        If X is not a 2D array with exactly two columns, or if any candidate in
        `copula_candidates` does not inherit from `BaseBivariateCopula`.
    """
    if copula_candidates is None:
        copula_candidates = [
            GaussianCopula(),
            StudentTCopula(),
            ClaytonCopula(),
            GumbelCopula(),
            JoeCopula(),
        ]

    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != 2:
        raise ValueError("X must contains two columns for Bivariate Copula")

    kendall_tau, p_value = st.kendalltau(X[:, 0], X[:, 1])
    if p_value >= independence_level:
        return IndependentCopula().fit(X)

    results = {}
    for copula in copula_candidates:
        if not isinstance(copula, BaseBivariateCopula):
            raise ValueError(
                "The candidate copula must inherit from `BaseBivariateCopula`"
            )
        copula = sk.clone(copula)
        if copula.itau and copula.kendall_tau is None:
            # Faster computation by reusing kendall tau if itau
            copula.kendall_tau = kendall_tau
        copula.fit(X)

        match selection_criterion:
            case selection_criterion.AIC:
                results[copula] = copula.aic(X)
            case selection_criterion.BIC:
                results[copula] = copula.bic(X)
            case _:
                raise ValueError(f"{selection_criterion} not implemented")

    selected_copula = min(results, key=results.get)
    return selected_copula