Source code for skfolio.distribution.univariate._selection

"""Univariate Distribution Selection."""

# Copyright (c) 2025
# Authors: The skfolio developers
# Credits: Matteo Manzi, Vincent Maladière, Carlo Nicolini
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import numpy.typing as npt
import sklearn as sk

from skfolio.distribution._base import SelectionCriterion
from skfolio.distribution.univariate._base import BaseUnivariateDist
from skfolio.distribution.univariate._gaussian import Gaussian
from skfolio.distribution.univariate._johnson_su import JohnsonSU
from skfolio.distribution.univariate._student_t import StudentT


[docs] def select_univariate_dist( X: npt.ArrayLike, distribution_candidates: list[BaseUnivariateDist] | None = None, selection_criterion: SelectionCriterion = SelectionCriterion.AIC, ) -> BaseUnivariateDist: """Select the optimal univariate distribution estimator based on an information criterion. For each candidate distribution, the function fits the distribution to X and then computes either the Akaike Information Criterion (AIC) or the Bayesian Information Criterion (BIC). The candidate with the lowest criterion value is returned. Parameters ---------- X : array-like of shape (n_observations, 1) The input data used to fit each candidate distribution. distribution_candidates : list of BaseUnivariateDist A list of candidate distribution estimators. Each candidate must be an instance of a class that inherits from `BaseUnivariateDist`. If None, defaults to `[Gaussian(), StudentT(), JohnsonSU()]`. selection_criterion : SelectionCriterion, default=SelectionCriterion.AIC The criterion used for model selection. Possible values are: - SelectionCriterion.AIC : Akaike Information Criterion - SelectionCriterion.BIC : Bayesian Information Criterion Returns ------- BaseUnivariateDist The fitted candidate estimator that minimizes the selected information criterion. Raises ------ ValueError If X does not have exactly one column or if any candidate in the list does not inherit from BaseUnivariateDist. """ if distribution_candidates is None: distribution_candidates = [ Gaussian(), StudentT(), JohnsonSU(), ] X = np.asarray(X) if X.ndim != 2 or X.shape[1] != 1: raise ValueError("X must contains one column for Univariate Distribution") results = {} for dist in distribution_candidates: if not isinstance(dist, BaseUnivariateDist): raise ValueError("Each candidate must inherit from `BaseUnivariateDist`") dist = sk.clone(dist) dist.fit(X) match selection_criterion: case selection_criterion.AIC: results[dist] = dist.aic(X) case selection_criterion.BIC: results[dist] = dist.bic(X) case _: raise ValueError(f"{selection_criterion} not implemented") selected_dist = min(results, key=results.get) return selected_dist