Source code for skfolio.pre_selection._select_complete

"""pre-selection SelectComplete module"""

# Copyright (c) 2023
# Author: Hugo Delatte <delatte.hugo@gmail.com>
# License: BSD 3 clause

import numpy as np
import numpy.typing as npt
import sklearn.base as skb
import sklearn.feature_selection as skf
import sklearn.utils.validation as skv


[docs] class SelectComplete(skf.SelectorMixin, skb.BaseEstimator): """ Transformer to select assets with complete data across the entire observation period. This transformer removes assets (columns) that have missing values (NaNs) at the beginning or end of the period. This transformer is especially useful for financial datasets where assets (e.g., stocks, bonds) may have data gaps due to late inception (assets that started trading later), early expiry or default (assets that stopped trading before the end of the period). If missing values are not at the beginning or end but occur between non-missing values, the asset is not removed unless `drop_assets_with_internal_nan` is set to `True`. Parameters ---------- drop_assets_with_internal_nan : bool, default=False If set to True, assets with missing values (NaNs) that appear between non-missing values (i.e., internal NaNs) will also be removed. By default, only assets with leading or trailing NaNs are removed. Attributes ---------- to_keep_ : ndarray of shape (n_assets, ) Boolean array indicating which assets are remaining. n_features_in_ : int Number of assets seen during `fit`. feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during `fit`. Defined only when `X` has feature names that are all strings. Examples -------- >>> import numpy as np >>> import pandas as pd >>> from skfolio.pre_selection import SelectComplete >>> X = pd.DataFrame({ ... 'asset1': [np.nan, np.nan, 2, 3, 4], # Starts late (inception) ... 'asset2': [1, 2, 3, 4, 5], # Complete data ... 'asset3': [1, 2, 3, np.nan, 5], # Missing values within data ... 'asset4': [1, 2, 3, 4, np.nan] # Ends early (expiration) ... }) >>> selector = SelectComplete() >>> selector.fit_transform(X) array([[ 1., 1.], [ 2., 2.], [ 3., 3.], [ 4., nan], [ 5., 5.]]) >>> selector = SelectComplete(drop_assets_with_internal_nan=True) >>> selector.fit_transform(X) array([[1.], [2.], [3.], [4.], [5.]]) """ to_keep_: np.ndarray def __init__(self, drop_assets_with_internal_nan: bool = False): self.drop_assets_with_internal_nan = drop_assets_with_internal_nan
[docs] def fit(self, X: npt.ArrayLike, y=None) -> "SelectComplete": """Run the SelectComplete transformer and get the appropriate assets. Parameters ---------- X : array-like of shape (n_observations, n_assets) Returns of the assets. y : Ignored Not used, present for API consistency by convention. Returns ------- self : SelectComplete Fitted estimator. """ # Validate by allowing NaNs X = skv.validate_data(self, X, ensure_all_finite="allow-nan") if self.drop_assets_with_internal_nan: # Identify columns with any NaNs self.to_keep_ = ~np.isnan(X).any(axis=0) else: # Identify columns with no leading or trailing NaNs self.to_keep_ = ~np.isnan(X[0, :]) & ~np.isnan(X[-1, :]) return self
def _get_support_mask(self) -> np.ndarray: skv.check_is_fitted(self) return self.to_keep_ def __sklearn_tags__(self): tags = super().__sklearn_tags__() tags.input_tags.allow_nan = True return tags