# Source code for skfolio.datasets._base

"""Datasets module."""

# Copyright (c) 2023
# Author: Hugo Delatte <delatte.hugo@gmail.com>
# SPDX-License-Identifier: BSD-3-Clause
# Implementation derived from:
# scikit-portfolio, Copyright (c) 2022, Carlo Nicolini, Licensed under MIT Licence.
# scikit-learn, Copyright (c) 2007-2010 David Cournapeau, Fabian Pedregosa, Olivier
# Grisel Licensed under BSD 3 clause.

import gzip
import os
import shutil
import sys
import urllib.request as ur
from importlib import resources
from pathlib import Path

import joblib
import pandas as pd

# Dotted name of the package used by default to look up '*.csv.gz' data files
# (default value of `data_module` in `load_gzip_compressed_csv_data`).
DATA_MODULE = "skfolio.datasets.data"


def get_data_home(data_home: str | Path | None = None) -> str:
    """Return the path of the skfolio data directory.

    This folder is used by some large dataset loaders to avoid downloading the
    data several times.

    By default, the data directory is set to a folder named 'skfolio_data' in the
    user home folder.

    Alternatively, it can be set by the 'SKFOLIO_DATA' environment
    variable or programmatically by giving an explicit folder path. The '~'
    symbol is expanded to the user home folder.

    If the folder does not already exist, it is automatically created.

    Parameters
    ----------
    data_home : str, optional
        The path to skfolio data directory. If `None`, the default path
        is `~/skfolio_data`.

    Returns
    -------
    data_home: str or path-like, optional
        The path to skfolio data directory.
    """
    if data_home is None:
        data_home = os.environ.get("SKFOLIO_DATA", os.path.join("~", "skfolio_data"))
    data_home = os.path.expanduser(data_home)
    os.makedirs(data_home, exist_ok=True)
    return data_home


def clear_data_home(data_home: str | Path | None = None) -> None:
    """Remove the data home cache folder together with all of its content.

    Parameters
    ----------
    data_home : str or path-like, optional
        The path to skfolio data directory. If `None`, the location is
        resolved by `get_data_home` (by default `~/skfolio_data`).
    """
    # `get_data_home` creates the folder when it is missing, so `rmtree` always
    # receives an existing directory.
    shutil.rmtree(get_data_home(data_home))


def load_gzip_compressed_csv_data(
    data_filename: str,
    data_module: str = DATA_MODULE,
    encoding: str = "utf-8",
    datetime_index: bool = True,
) -> pd.DataFrame:
    """Load gzip-compressed csv files with `importlib.resources`.

    1) Locate the resource file with `importlib.resources.files` and open it
       in binary mode
    2) Decompress csv file with `gzip.open`
    3) Load decompressed data with `pd.read_csv`

    Parameters
    ----------
    data_filename : str
        Name of gzip-compressed csv file  (`'*.csv.gz'`) to be loaded from
        `data_module/data_filename`. For example `'SPX500.csv.gz'`.

    data_module : str or module, default='skfolio.datasets.data'
        Module where data lives. The default is `'skfolio.datasets.data'`.

    encoding : str, default="utf-8"
        Name of the encoding that the gzip-decompressed file will be
        decoded with. The default is 'utf-8'.

    datetime_index: bool, default=True
        If this is set to True, the DataFrame index is converted to datetime with
        format="%Y-%m-%d".
        The default is `True`.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        DataFrame with each row representing one observation and each column
        representing the asset price of a given observation.
    """
    path = resources.files(data_module).joinpath(data_filename)
    # Both the raw resource handle and the gzip text wrapper are context-managed
    # so that neither is left open once the csv has been parsed.
    with (
        path.open("rb") as compressed_file,
        gzip.open(compressed_file, mode="rt", encoding=encoding) as csv_file,
    ):
        # The first csv column is used as the DataFrame index.
        df = pd.read_csv(csv_file, sep=",", index_col=0)

    if datetime_index:
        df.index = pd.to_datetime(df.index, format="%Y-%m-%d")
    return df


def download_dataset(
    data_filename: str,
    data_home: str | Path | None = None,
    download_if_missing: bool = True,
) -> pd.DataFrame:
    """Download and save locally a dataset from the remote GitHub dataset folder.

    The dataset is cached in `data_home` as `<data_filename>.pkz`. When that
    cache file already exists it is loaded directly and nothing is downloaded.

    Parameters
    ----------
    data_filename : str
        Base name (without the `'.csv.gz'` extension, which is appended
        automatically) of the gzip-compressed csv file to be loaded from the
        remote GitHub dataset folder. For example `'ftse100_dataset'`.

    data_home : str or path-like, optional
        Specify another download and cache folder for the datasets. By default,
        all skfolio data is stored in `~/skfolio_data` sub-folders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.
        The default is `True`.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        DataFrame with each row representing one observation and each column
        representing the asset price of a given observation.

    Raises
    ------
    OSError
        If the dataset is not cached locally and `download_if_missing` is False.
    """
    # Use a CORS proxy when triggering requests from the browser
    url_prefix = "https://corsproxy.io/?" if sys.platform == "emscripten" else ""
    url = url_prefix + (
        f"https://github.com/skfolio/skfolio-datasets/raw/main/"
        f"datasets/{data_filename}.csv.gz"
    )

    data_home = get_data_home(data_home=data_home)
    filepath = os.path.join(data_home, f"{data_filename}.pkz")

    if os.path.exists(filepath):
        # NOTE(review): `joblib.load` unpickles the cache file; this presumes the
        # content of `data_home` is trusted (written by a previous call) - confirm.
        return joblib.load(filepath)

    if not download_if_missing:
        raise OSError("Data not found and `download_if_missing` is False")

    archive_path = os.path.join(data_home, os.path.basename(url))
    try:
        ur.urlretrieve(url, archive_path)
        df = load_gzip_compressed_csv_data(archive_path)
        joblib.dump(df, filepath, compress=6)
    finally:
        # The archive is only a temporary artefact: remove it even when the
        # download, the parsing or the caching step fails, so that no partial
        # or stale '*.csv.gz' file is left in the data home.
        if os.path.exists(archive_path):
            os.remove(archive_path)
    return df



# [docs]
def load_sp500_dataset() -> pd.DataFrame:
    """Load the daily prices of 20 assets from the S&P 500 Index composition.

    The dataset covers 20 assets from the S&P 500 composition, with daily
    prices starting from 1990-01-02 up to 2022-12-28. It is read from the
    gzip-compressed csv file `sp500_dataset.csv.gz`.

    The data comes from the Yahoo public API.
    The price is the adjusted close which is the closing price after adjustments for
    all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     8313
    Assets           20
    ==============   ==================

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_sp500_dataset
    >>> prices = load_sp500_dataset()
    >>> prices.head()
                    AAPL     AMD       BAC  ...       UNH       WMT      XOM
    1990-01-02  0.332589  4.1250  11.65625  ...  0.382813  5.890625  12.5000
    1990-01-03  0.334821  4.0000  11.75000  ...  0.375000  5.890625  12.3750
    1990-01-04  0.335938  3.9375  11.50000  ...  0.371094  5.859375  12.2500
    1990-01-05  0.337054  3.8125  11.25000  ...  0.355469  5.796875  12.1875
    1990-01-08  0.339286  3.8125  11.31250  ...  0.347656  5.875000  12.3750
    """
    return load_gzip_compressed_csv_data("sp500_dataset.csv.gz")




# [docs]
def load_sp500_index() -> pd.DataFrame:
    """Load the daily prices of the S&P 500 Index.

    The dataset holds the daily prices of the S&P 500 Index starting from
    1990-01-02 up to 2022-12-28. It is read from the gzip-compressed csv file
    `sp500_index.csv.gz`.

    The data comes from the Yahoo public API.
    The price is the adjusted close which is the closing price after adjustments for
    all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     8313
    Assets           1
    ==============   ==================

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_sp500_index
    >>> prices = load_sp500_index()
    >>> prices.head()
                 SP500
    Date
    1990-01-02  359.69
    1990-01-03  358.76
    1990-01-04  355.67
    1990-01-05  352.20
    1990-01-08  353.79
    """
    return load_gzip_compressed_csv_data("sp500_index.csv.gz")




# [docs]
def load_factors_dataset() -> pd.DataFrame:
    """Load the daily prices of 5 factor ETFs.

    The dataset holds the daily prices of 5 ETF representing common factors,
    starting from 2014-01-02 up to 2022-12-28. It is read from the
    gzip-compressed csv file `factors_dataset.csv.gz`.

    The factors are:

        * "MTUM": Momentum
        * "QUAL": Quality
        * "SIZE": Size
        * "VLUE": Value
        * "USMV": low volatility

    The data comes from the Yahoo public API.
    The price is the adjusted close which is the closing price after adjustments for
    all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     2264
    Assets           5
    ==============   ==================

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_factors_dataset
    >>> prices = load_factors_dataset()
    >>> prices.head()
                  MTUM    QUAL    SIZE    USMV    VLUE
    Date
    2014-01-02  52.704  48.351  48.986  29.338  47.054
    2014-01-03  52.792  48.256  48.722  29.330  46.999
    2014-01-06  52.677  48.067  48.722  29.263  46.991
    2014-01-07  53.112  48.455  48.731  29.430  47.253
    2014-01-08  53.502  48.437  48.731  29.422  47.253
    """
    return load_gzip_compressed_csv_data("factors_dataset.csv.gz")




# [docs]
def load_ftse100_dataset(data_home=None, download_if_missing=True) -> pd.DataFrame:
    """Load the daily prices of 64 assets from the FTSE 100 Index composition.

    The dataset holds the daily prices of 64 assets from the FTSE 100 Index
    starting from 2000-01-04 up to 2023-05-31. It is obtained through
    `download_dataset` under the name `ftse100_dataset`.

    The data comes from the Yahoo public API.
    The price is the adjusted close which is the closing price after adjustments for
    all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.
    The data contains NaN.

    ==============   ==================
    Observations     5960
    Assets           64
    ==============   ==================

    Parameters
    ----------
    data_home : str or path-like, optional
        Specify another download and cache folder for the datasets.
        By default, all skfolio data is stored in `~/skfolio_data` subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_ftse100_dataset
    >>> prices = load_ftse100_dataset()
    >>> prices.head()
                  AAL.L    ABF.L   AHT.L  ANTO.L  ...   VOD.L   WEIR.L    WPP.L    WTB.L
    Date                                          ...
    2000-01-04  535.354  205.926  97.590  40.313  ...  72.562  115.240  512.249  382.907
    2000-01-05  540.039  209.185  96.729  40.313  ...  69.042  118.483  462.080  381.972
    2000-01-06  553.289  229.048  95.581  40.452  ...  66.950  124.220  458.119  386.337
    2000-01-07  572.829  222.220  95.581  40.452  ...  70.716  121.725  475.283  405.046
    2000-01-10  578.852  224.548  92.711  40.685  ...  74.285  121.476  498.254  392.885
    """
    return download_dataset(
        "ftse100_dataset",
        data_home=data_home,
        download_if_missing=download_if_missing,
    )




# [docs]
def load_nasdaq_dataset(data_home=None, download_if_missing=True) -> pd.DataFrame:
    """Load the daily prices of 1455 assets from the NASDAQ Composite Index.

    The dataset holds the daily prices of 1455 assets from the NASDAQ
    Composite starting from 2018-01-02 up to 2023-05-31. It is obtained through
    `download_dataset` under the name `nasdaq_dataset`.

    The data comes from the Yahoo public API.
    The price is the adjusted close which is the closing price after adjustments for
    all applicable splits and dividend distributions.
    The adjustment uses appropriate split and dividend multipliers, adhering to
    the Center for Research in Security Prices (CRSP) standards.

    ==============   ==================
    Observations     1362
    Assets           1455
    ==============   ==================

    Parameters
    ----------
    data_home : str or path-like, optional
        Specify another download and cache folder for the datasets.
        By default, all skfolio data is stored in `~/skfolio_data` subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Prices DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_nasdaq_dataset
    >>> prices = load_nasdaq_dataset()
    >>> prices.head()
                   AAL   AAOI    AAON    AAPL  ...  ZVRA   ZYME    ZYNE   ZYXI
    Date                                       ...
    2018-01-02  51.648  37.91  35.621  41.310  ...  66.4  7.933  12.995  2.922
    2018-01-03  51.014  37.89  36.247  41.303  ...  72.8  7.965  13.460  2.913
    2018-01-04  51.336  38.38  36.103  41.495  ...  78.4  8.430  12.700  2.869
    2018-01-05  51.316  38.89  36.681  41.967  ...  77.6  8.400  12.495  2.780
    2018-01-08  50.809  38.37  36.103  41.811  ...  82.4  8.310  12.550  2.825
    """
    return download_dataset(
        "nasdaq_dataset",
        data_home=data_home,
        download_if_missing=download_if_missing,
    )



def load_sp500_implied_vol_dataset(
    data_home=None, download_if_missing=True
) -> pd.DataFrame:
    """Load the 3 months ATM implied volatility of the 20 assets from the
    SP500 dataset.

    The dataset holds the 3 months ATM implied volatility of 20 assets from
    the S&P 500 composition starting from 2010-01-04 up to 2022-12-28. It is
    obtained through `download_dataset` under the name
    `sp500_implied_vol_dataset`.

    The data comes from the Yahoo public API option chains.

    ==============   ==================
    Observations     3270
    Assets           20
    ==============   ==================

    Parameters
    ----------
    data_home : str or path-like, optional
        Specify another download and cache folder for the datasets.
        By default, all skfolio data is stored in `~/skfolio_data` subfolders.

    download_if_missing : bool, default=True
        If False, raise an OSError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    df : DataFrame of shape (n_observations, n_assets)
        Implied volatility DataFrame

    Examples
    --------
    >>> from skfolio.datasets import load_sp500_implied_vol_dataset
    >>> implied_vol = load_sp500_implied_vol_dataset()
    >>> implied_vol.head()
                    AAPL       AMD       BAC  ...       UNH       WMT       XOM
    Date                                      ...
    2010-01-04  0.364353  0.572056  0.382926  ...  0.362751  0.171737  0.201485
    2010-01-05  0.371865  0.568791  0.374699  ...  0.368504  0.174764  0.203852
    2010-01-06  0.356746  0.558054  0.349220  ...  0.368514  0.171892  0.197475
    2010-01-07  0.361084  0.560475  0.354942  ...  0.355792  0.169083  0.200046
    2010-01-08  0.348085  0.543932  0.360345  ...  0.351130  0.170897  0.204832
    """
    return download_dataset(
        "sp500_implied_vol_dataset",
        data_home=data_home,
        download_if_missing=download_if_missing,
    )