some new features

2025-07-30 17:09:11 +03:00
parent db5d46760a
commit 8019bd3b7c
20616 changed files with 4375466 additions and 8 deletions
--- a/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py
+++ b/.venv/lib/python3.12/site-packages/sklearn/datasets/_arff_parser.py
@ -0,0 +1,540 @@
+"""Implementation of ARFF parsers: via LIAC-ARFF and pandas."""
+
+import itertools
+import re
+from collections import OrderedDict
+from collections.abc import Generator
+from typing import List
+
+import numpy as np
+import scipy as sp
+
+from ..externals import _arff
+from ..externals._arff import ArffSparseDataType
+from ..utils._chunking import chunk_generator, get_chunk_n_rows
+from ..utils._optional_dependencies import check_pandas_support
+from ..utils.fixes import pd_fillna
+
+
+def _split_sparse_columns(
+    arff_data: ArffSparseDataType, include_columns: List
+) -> ArffSparseDataType:
+    """Select a subset of columns from a sparse ARFF representation.
+
+    The column indices of the kept entries are re-labelled to their position
+    in `include_columns` (e.g., when including [1, 2, 3], the columns will be
+    relabelled to [0, 1, 2]).
+
+    Parameters
+    ----------
+    arff_data : tuple
+        A tuple of three sequences of equal size: the values, the row index
+        of each value and the column index of each value.
+
+    include_columns : list
+        A list of (original) column indices to include.
+
+    Returns
+    -------
+    arff_data_new : tuple
+        A tuple of three lists (values, row indices, re-labelled column
+        indices) holding only the entries whose column is listed in
+        `include_columns`. Row indices are left untouched.
+    """
+    # Map original column index -> position in the output; this dict also
+    # serves as an O(1) membership test inside the loop below.
+    reindexed_columns = {
+        column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)
+    }
+    arff_data_new: ArffSparseDataType = (list(), list(), list())
+    values, row_indices, col_indices = arff_data_new
+    for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
+        new_col_idx = reindexed_columns.get(col_idx)
+        # Compare against None explicitly: 0 is a valid re-labelled index.
+        if new_col_idx is None:
+            continue
+        values.append(val)
+        row_indices.append(row_idx)
+        col_indices.append(new_col_idx)
+    return arff_data_new
+
+
+def _sparse_data_to_array(
+    arff_data: ArffSparseDataType, include_columns: List
+) -> np.ndarray:
+    """Turn selected columns of a sparse ARFF representation into a dense array.
+
+    Parameters
+    ----------
+    arff_data : tuple
+        A tuple of three sequences of equal size: the values, the row index
+        of each value and the column index of each value.
+
+    include_columns : list
+        A list of (original) column indices to include. Output column `j`
+        corresponds to `include_columns[j]`.
+
+    Returns
+    -------
+    y : ndarray of shape (n_rows, len(include_columns)), dtype float64
+        Dense array where `n_rows` is the largest row index found in
+        `arff_data` plus one (0 when `arff_data` holds no entry). Cells that
+        are not stored in the sparse representation are 0.
+    """
+    # The original note here says scipy's `toarray()` cannot be used because
+    # it only works on numeric data, hence the manual densification.
+    # `default=-1` makes an empty payload yield zero rows instead of raising.
+    num_obs = max(arff_data[1], default=-1) + 1
+    y_shape = (num_obs, len(include_columns))
+    reindexed_columns = {
+        column_idx: array_idx for array_idx, column_idx in enumerate(include_columns)
+    }
+    # Entries absent from a sparse representation are implicit zeros, so the
+    # output must be zero-initialised (`np.empty` would leave arbitrary memory
+    # in every cell that is not explicitly stored).
+    # TODO: improve for efficiency
+    y = np.zeros(y_shape, dtype=np.float64)
+    for val, row_idx, col_idx in zip(arff_data[0], arff_data[1], arff_data[2]):
+        new_col_idx = reindexed_columns.get(col_idx)
+        # Compare against None explicitly: 0 is a valid re-labelled index.
+        if new_col_idx is not None:
+            y[row_idx, new_col_idx] = val
+    return y
+
+
+def _post_process_frame(frame, feature_names, target_names):
+    """Split a dataframe into the feature part `X` and the target part `y`.
+
+    Parameters
+    ----------
+    frame : dataframe
+        The dataframe holding both the feature and the target columns.
+
+    feature_names : list of str
+        Names of the columns that make up `X`.
+
+    target_names : list of str
+        Names of the columns that make up `y`.
+
+    Returns
+    -------
+    X : dataframe
+        `frame` restricted to `feature_names`.
+
+    y : {series, dataframe} or None
+        A series when exactly one target is requested, a dataframe when two
+        or more are requested and `None` when `target_names` is empty.
+    """
+    features = frame[feature_names]
+
+    n_targets = len(target_names)
+    if n_targets == 0:
+        return features, None
+    if n_targets == 1:
+        # A single target is returned as a 1-D series, not a 1-column frame.
+        return features, frame[target_names[0]]
+    return features, frame[target_names]
+
+
+def _liac_arff_parser(
+    gzip_file,
+    output_arrays_type,
+    openml_columns_info,
+    feature_names_to_select,
+    target_names_to_select,
+    shape=None,
+):
+    """ARFF parser using the LIAC-ARFF library coded purely in Python.
+
+    This parser is quite slow but consumes a generator. Currently it is needed
+    to parse sparse datasets. For dense datasets, it is recommended to instead
+    use the pandas-based parser, although it does not always handle the
+    dtypes exactly the same.
+
+    Parameters
+    ----------
+    gzip_file : GzipFile instance
+        The file compressed to be read.
+
+    output_arrays_type : {"numpy", "sparse", "pandas"}
+        The type of the arrays that will be returned. The possibilities are:
+
+        - `"numpy"`: both `X` and `y` will be NumPy arrays;
+        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
+        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
+          pandas Series or DataFrame.
+
+    openml_columns_info : dict
+        The information provided by OpenML regarding the columns of the ARFF
+        file. It is indexed by column name; the `"data_type"` entry is read
+        for the pandas output and the `"index"` entry otherwise.
+
+    feature_names_to_select : list of str
+        A list of the feature names to be selected.
+
+    target_names_to_select : list of str
+        A list of the target names to be selected.
+
+    shape : tuple of int, default=None
+        Shape `(n_rows, n_columns)` used to reshape the flat stream of values
+        when LIAC-ARFF returns the data as a generator; it is required in
+        that case. `shape[0]` may be `-1` when the number of rows is unknown.
+
+    Returns
+    -------
+    X : {ndarray, sparse matrix, dataframe}
+        The data matrix.
+
+    y : {ndarray, dataframe, series} or None
+        The target. `None` when no target column is selected.
+
+    frame : dataframe or None
+        A dataframe containing both `X` and `y`. `None` if
+        `output_arrays_type != "pandas"`.
+
+    categories : dict or None
+        A mapping from the name of each selected nominal column to the list
+        of its categories; the entries of nominal target columns are removed
+        once `y` has been decoded. `None` if `output_arrays_type == "pandas"`.
+
+    Raises
+    ------
+    ValueError
+        If the data is a generator and `shape` is not provided, if the data
+        returned by LIAC-ARFF has an unexpected type, or if the targets mix
+        nominal and non-nominal columns.
+    """
+
+    def _io_to_generator(gzip_file):
+        # LIAC-ARFF is fed decoded text lines, the file yields bytes.
+        for line in gzip_file:
+            yield line.decode("utf-8")
+
+    stream = _io_to_generator(gzip_file)
+
+    # find which type (dense or sparse) ARFF type we will have to deal with
+    return_type = _arff.COO if output_arrays_type == "sparse" else _arff.DENSE_GEN
+    # Nominal attributes are encoded by LIAC-ARFF for the "numpy" and "sparse"
+    # outputs (which are converted to float64 below); for the "pandas" output
+    # the raw values are kept and cast to "category" later on.
+    encode_nominal = not (output_arrays_type == "pandas")
+    arff_container = _arff.load(
+        stream, return_type=return_type, encode_nominal=encode_nominal
+    )
+    columns_to_select = feature_names_to_select + target_names_to_select
+
+    # nominal attributes are the ones whose declared type is a list of values
+    categories = {
+        name: cat
+        for name, cat in arff_container["attributes"]
+        if isinstance(cat, list) and name in columns_to_select
+    }
+    if output_arrays_type == "pandas":
+        pd = check_pandas_support("fetch_openml with as_frame=True")
+
+        columns_info = OrderedDict(arff_container["attributes"])
+        columns_names = list(columns_info.keys())
+
+        # calculate chunksize
+        # (the memory footprint of the first row is used as the per-row size)
+        first_row = next(arff_container["data"])
+        first_df = pd.DataFrame([first_row], columns=columns_names, copy=False)
+
+        row_bytes = first_df.memory_usage(deep=True).sum()
+        chunksize = get_chunk_n_rows(row_bytes)
+
+        # read arff data with chunks
+        columns_to_keep = [col for col in columns_names if col in columns_to_select]
+        dfs = [first_df[columns_to_keep]]
+        for data in chunk_generator(arff_container["data"], chunksize):
+            dfs.append(
+                pd.DataFrame(data, columns=columns_names, copy=False)[columns_to_keep]
+            )
+        # dfs[0] contains only one row, which may not have enough data to infer to
+        # column's dtype. Here we use `dfs[1]` to configure the dtype in dfs[0]
+        if len(dfs) >= 2:
+            dfs[0] = dfs[0].astype(dfs[1].dtypes)
+
+        # liac-arff parser does not depend on NumPy and uses None to represent
+        # missing values. To be consistent with the pandas parser, we replace
+        # None with np.nan.
+        frame = pd.concat(dfs, ignore_index=True)
+        frame = pd_fillna(pd, frame)
+        del dfs, first_df
+
+        # cast the columns frame
+        dtypes = {}
+        for name in frame.columns:
+            column_dtype = openml_columns_info[name]["data_type"]
+            if column_dtype.lower() == "integer":
+                # Use a pandas extension array instead of np.int64 to be able
+                # to support missing values.
+                dtypes[name] = "Int64"
+            elif column_dtype.lower() == "nominal":
+                dtypes[name] = "category"
+            else:
+                dtypes[name] = frame.dtypes[name]
+        frame = frame.astype(dtypes)
+
+        X, y = _post_process_frame(
+            frame, feature_names_to_select, target_names_to_select
+        )
+    else:
+        arff_data = arff_container["data"]
+
+        # positions of the requested columns, as given by the OpenML metadata
+        feature_indices_to_select = [
+            int(openml_columns_info[col_name]["index"])
+            for col_name in feature_names_to_select
+        ]
+        target_indices_to_select = [
+            int(openml_columns_info[col_name]["index"])
+            for col_name in target_names_to_select
+        ]
+
+        if isinstance(arff_data, Generator):
+            # dense data: a generator of rows that is flattened into one
+            # float64 vector and reshaped to `shape`
+            if shape is None:
+                raise ValueError(
+                    "shape must be provided when arr['data'] is a Generator"
+                )
+            if shape[0] == -1:
+                # unknown number of rows: let `np.fromiter` consume everything
+                count = -1
+            else:
+                count = shape[0] * shape[1]
+            data = np.fromiter(
+                itertools.chain.from_iterable(arff_data),
+                dtype="float64",
+                count=count,
+            )
+            data = data.reshape(*shape)
+            X = data[:, feature_indices_to_select]
+            y = data[:, target_indices_to_select]
+        elif isinstance(arff_data, tuple):
+            # sparse data: a (values, row indices, column indices) tuple
+            arff_data_X = _split_sparse_columns(arff_data, feature_indices_to_select)
+            # the number of rows is inferred from the largest stored row index
+            num_obs = max(arff_data[1]) + 1
+            X_shape = (num_obs, len(feature_indices_to_select))
+            X = sp.sparse.coo_matrix(
+                (arff_data_X[0], (arff_data_X[1], arff_data_X[2])),
+                shape=X_shape,
+                dtype=np.float64,
+            )
+            X = X.tocsr()
+            y = _sparse_data_to_array(arff_data, target_indices_to_select)
+        else:
+            # This should never happen
+            raise ValueError(
+                f"Unexpected type for data obtained from arff: {type(arff_data)}"
+            )
+
+        # set of booleans: empty when there is no target, {True} when all the
+        # targets are nominal, {False} when none is, {True, False} when mixed
+        is_classification = {
+            col_name in categories for col_name in target_names_to_select
+        }
+        if not is_classification:
+            # No target
+            pass
+        elif all(is_classification):
+            # map the encoded values of each nominal target back to its
+            # categories; the decoded targets are dropped from `categories`
+            y = np.hstack(
+                [
+                    np.take(
+                        np.asarray(categories.pop(col_name), dtype="O"),
+                        y[:, i : i + 1].astype(int, copy=False),
+                    )
+                    for i, col_name in enumerate(target_names_to_select)
+                ]
+            )
+        elif any(is_classification):
+            raise ValueError(
+                "Mix of nominal and non-nominal targets is not currently supported"
+            )
+
+        # reshape y back to 1-D array, if there is only 1 target column;
+        # back to None if there are not target columns
+        if y.shape[1] == 1:
+            y = y.reshape((-1,))
+        elif y.shape[1] == 0:
+            y = None
+
+    if output_arrays_type == "pandas":
+        return X, y, frame, None
+    return X, y, None, categories
+
+
+def _pandas_arff_parser(
+    gzip_file,
+    output_arrays_type,
+    openml_columns_info,
+    feature_names_to_select,
+    target_names_to_select,
+    read_csv_kwargs=None,
+):
+    """ARFF parser using `pandas.read_csv`.
+
+    This parser uses the metadata fetched directly from OpenML and skips the metadata
+    headers of ARFF file itself. The data is loaded as a CSV file.
+
+    Parameters
+    ----------
+    gzip_file : GzipFile instance
+        The GZip compressed file with the ARFF formatted payload.
+
+    output_arrays_type : {"numpy", "sparse", "pandas"}
+        The type of the arrays that will be returned. The possibilities are:
+
+        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
+          pandas Series or DataFrame;
+        - any other value: `X` and `y` are converted to NumPy arrays with
+          `to_numpy()` (this parser never builds a sparse matrix).
+
+    openml_columns_info : dict
+        The information provided by OpenML regarding the columns of the ARFF
+        file. It is indexed by column name, in the order of the columns of
+        the file; the `"data_type"` entry of each column is read.
+
+    feature_names_to_select : list of str
+        A list of the feature names to be selected to build `X`.
+
+    target_names_to_select : list of str
+        A list of the target names to be selected to build `y`.
+
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
+        the default options.
+
+    Returns
+    -------
+    X : {ndarray, dataframe}
+        The data matrix.
+
+    y : {ndarray, dataframe, series} or None
+        The target. `None` when `target_names_to_select` is empty.
+
+    frame : dataframe or None
+        A dataframe containing both `X` and `y`. `None` if
+        `output_arrays_type != "pandas"`.
+
+    categories : dict or None
+        A mapping from the name of each selected categorical column to the
+        list of its categories. `None` if `output_arrays_type == "pandas"`.
+
+    Raises
+    ------
+    pandas.errors.ParserError
+        If the number of columns read from the file does not match the number
+        of columns described by `openml_columns_info`.
+    """
+    import pandas as pd
+
+    # read the file until the data section to skip the ARFF metadata headers
+    for line in gzip_file:
+        if line.decode("utf-8").lower().startswith("@data"):
+            break
+
+    dtypes = {}
+    for name in openml_columns_info:
+        column_dtype = openml_columns_info[name]["data_type"]
+        if column_dtype.lower() == "integer":
+            # Use Int64 to infer missing values from data
+            # XXX: this line is not covered by our tests. Is this really needed?
+            dtypes[name] = "Int64"
+        elif column_dtype.lower() == "nominal":
+            dtypes[name] = "category"
+    # since we will not pass `names` when reading the ARFF file, we need to translate
+    # `dtypes` from column names to column indices to pass to `pandas.read_csv`
+    dtypes_positional = {
+        col_idx: dtypes[name]
+        for col_idx, name in enumerate(openml_columns_info)
+        if name in dtypes
+    }
+
+    default_read_csv_kwargs = {
+        "header": None,
+        "index_col": False,  # always force pandas to not use the first column as index
+        "na_values": ["?"],  # missing values are represented by `?`
+        "keep_default_na": False,  # only `?` is a missing value given the ARFF specs
+        "comment": "%",  # skip line starting by `%` since they are comments
+        "quotechar": '"',  # delimiter to use for quoted strings
+        "skipinitialspace": True,  # skip spaces after delimiter to follow ARFF specs
+        "escapechar": "\\",
+        "dtype": dtypes_positional,
+    }
+    # user-provided options take precedence over the defaults
+    read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
+    frame = pd.read_csv(gzip_file, **read_csv_kwargs)
+    try:
+        # Setting the columns while reading the file will select the N first columns
+        # and not raise a ParserError. Instead, we set the columns after reading the
+        # file and raise a ParserError if the number of columns does not match the
+        # number of columns in the metadata given by OpenML.
+        frame.columns = list(openml_columns_info)
+    except ValueError as exc:
+        raise pd.errors.ParserError(
+            "The number of columns provided by OpenML does not match the number of "
+            "columns inferred by pandas when reading the file."
+        ) from exc
+
+    columns_to_select = feature_names_to_select + target_names_to_select
+    columns_to_keep = [col for col in frame.columns if col in columns_to_select]
+    frame = frame[columns_to_keep]
+
+    # `pd.read_csv` automatically handles double quotes for quoting non-numeric
+    # CSV cell values. Contrary to LIAC-ARFF, `pd.read_csv` cannot be configured to
+    # consider either single quotes and double quotes as valid quoting chars at
+    # the same time since this case does not occur in regular (non-ARFF) CSV files.
+    # To mimic the behavior of LIAC-ARFF parser, we manually strip single quotes
+    # on categories as a post-processing steps if needed.
+    #
+    # Note however that we intentionally do not attempt to do this kind of manual
+    # post-processing of (non-categorical) string-typed columns because we cannot
+    # resolve the ambiguity of the case of CSV cell with nesting quoting such as
+    # `"'some string value'"` with pandas.
+    single_quote_pattern = re.compile(r"^'(?P<contents>.*)'$")
+
+    def strip_single_quotes(input_string):
+        match = single_quote_pattern.search(input_string)
+        if match is None:
+            return input_string
+
+        return match.group("contents")
+
+    categorical_columns = [
+        name
+        for name, dtype in frame.dtypes.items()
+        if isinstance(dtype, pd.CategoricalDtype)
+    ]
+    for col in categorical_columns:
+        frame[col] = frame[col].cat.rename_categories(strip_single_quotes)
+
+    X, y = _post_process_frame(frame, feature_names_to_select, target_names_to_select)
+
+    if output_arrays_type == "pandas":
+        return X, y, frame, None
+
+    X = X.to_numpy()
+    # `y` is None when no target column was requested: there is nothing to
+    # convert and calling `to_numpy` on it would raise an AttributeError.
+    if y is not None:
+        y = y.to_numpy()
+
+    categories = {
+        name: dtype.categories.tolist()
+        for name, dtype in frame.dtypes.items()
+        if isinstance(dtype, pd.CategoricalDtype)
+    }
+    return X, y, None, categories
+
+
+def load_arff_from_gzip_file(
+    gzip_file,
+    parser,
+    output_type,
+    openml_columns_info,
+    feature_names_to_select,
+    target_names_to_select,
+    shape=None,
+    read_csv_kwargs=None,
+):
+    """Dispatch the parsing of a compressed ARFF file to the requested parser.
+
+    Parameters
+    ----------
+    gzip_file : GzipFile instance
+        The compressed file to read.
+
+    parser : {"pandas", "liac-arff"}
+        The parser used to parse the ARFF file. "pandas" is recommended
+        but only supports loading dense datasets.
+
+    output_type : {"numpy", "sparse", "pandas"}
+        The type of the arrays that will be returned. The possibilities are:
+
+        - `"numpy"`: both `X` and `y` will be NumPy arrays;
+        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
+        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
+          pandas Series or DataFrame.
+
+    openml_columns_info : dict
+        The information provided by OpenML regarding the columns of the ARFF
+        file.
+
+    feature_names_to_select : list of str
+        A list of the feature names to be selected.
+
+    target_names_to_select : list of str
+        A list of the target names to be selected.
+
+    shape : tuple of int, default=None
+        Forwarded to the "liac-arff" parser only; ignored by the "pandas"
+        parser.
+
+    read_csv_kwargs : dict, default=None
+        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
+        the default options. Forwarded to the "pandas" parser only.
+
+    Returns
+    -------
+    X, y, frame, categories
+        The four values returned by the selected parser, unchanged.
+
+    Raises
+    ------
+    ValueError
+        If `parser` is neither "liac-arff" nor "pandas".
+    """
+    # Arguments shared by both parsers, in the order they expect them.
+    common_args = (
+        gzip_file,
+        output_type,
+        openml_columns_info,
+        feature_names_to_select,
+        target_names_to_select,
+    )
+
+    if parser == "liac-arff":
+        return _liac_arff_parser(*common_args, shape)
+
+    if parser == "pandas":
+        return _pandas_arff_parser(*common_args, read_csv_kwargs)
+
+    raise ValueError(
+        f"Unknown parser: '{parser}'. Should be 'liac-arff' or 'pandas'."
+    )