reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/sklearn/datasets/init.py
+++ b/venv/lib/python3.11/site-packages/sklearn/datasets/init.py
@ -0,0 +1,161 @@
+"""Utilities to load popular datasets and artificial data generators."""
+
+import textwrap
+
+from ._base import (
+    clear_data_home,
+    get_data_home,
+    load_breast_cancer,
+    load_diabetes,
+    load_digits,
+    load_files,
+    load_iris,
+    load_linnerud,
+    load_sample_image,
+    load_sample_images,
+    load_wine,
+)
+from ._california_housing import fetch_california_housing
+from ._covtype import fetch_covtype
+from ._kddcup99 import fetch_kddcup99
+from ._lfw import fetch_lfw_pairs, fetch_lfw_people
+from ._olivetti_faces import fetch_olivetti_faces
+from ._openml import fetch_openml
+from ._rcv1 import fetch_rcv1
+from ._samples_generator import (
+    make_biclusters,
+    make_blobs,
+    make_checkerboard,
+    make_circles,
+    make_classification,
+    make_friedman1,
+    make_friedman2,
+    make_friedman3,
+    make_gaussian_quantiles,
+    make_hastie_10_2,
+    make_low_rank_matrix,
+    make_moons,
+    make_multilabel_classification,
+    make_regression,
+    make_s_curve,
+    make_sparse_coded_signal,
+    make_sparse_spd_matrix,
+    make_sparse_uncorrelated,
+    make_spd_matrix,
+    make_swiss_roll,
+)
+from ._species_distributions import fetch_species_distributions
+from ._svmlight_format_io import (
+    dump_svmlight_file,
+    load_svmlight_file,
+    load_svmlight_files,
+)
+from ._twenty_newsgroups import fetch_20newsgroups, fetch_20newsgroups_vectorized
+
+__all__ = [
+    "clear_data_home",
+    "dump_svmlight_file",
+    "fetch_20newsgroups",
+    "fetch_20newsgroups_vectorized",
+    "fetch_lfw_pairs",
+    "fetch_lfw_people",
+    "fetch_olivetti_faces",
+    "fetch_species_distributions",
+    "fetch_california_housing",
+    "fetch_covtype",
+    "fetch_rcv1",
+    "fetch_kddcup99",
+    "fetch_openml",
+    "get_data_home",
+    "load_diabetes",
+    "load_digits",
+    "load_files",
+    "load_iris",
+    "load_breast_cancer",
+    "load_linnerud",
+    "load_sample_image",
+    "load_sample_images",
+    "load_svmlight_file",
+    "load_svmlight_files",
+    "load_wine",
+    "make_biclusters",
+    "make_blobs",
+    "make_circles",
+    "make_classification",
+    "make_checkerboard",
+    "make_friedman1",
+    "make_friedman2",
+    "make_friedman3",
+    "make_gaussian_quantiles",
+    "make_hastie_10_2",
+    "make_low_rank_matrix",
+    "make_moons",
+    "make_multilabel_classification",
+    "make_regression",
+    "make_s_curve",
+    "make_sparse_coded_signal",
+    "make_sparse_spd_matrix",
+    "make_sparse_uncorrelated",
+    "make_spd_matrix",
+    "make_swiss_roll",
+]
+
+
+def __getattr__(name):
+    if name == "load_boston":
+        msg = textwrap.dedent(
+            """
+            `load_boston` has been removed from scikit-learn since version 1.2.
+
+            The Boston housing prices dataset has an ethical problem: as
+            investigated in [1], the authors of this dataset engineered a
+            non-invertible variable "B" assuming that racial self-segregation had a
+            positive impact on house prices [2]. Furthermore the goal of the
+            research that led to the creation of this dataset was to study the
+            impact of air quality but it did not give adequate demonstration of the
+            validity of this assumption.
+
+            The scikit-learn maintainers therefore strongly discourage the use of
+            this dataset unless the purpose of the code is to study and educate
+            about ethical issues in data science and machine learning.
+
+            In this special case, you can fetch the dataset from the original
+            source::
+
+                import pandas as pd
+                import numpy as np
+
+                data_url = "http://lib.stat.cmu.edu/datasets/boston"
+                raw_df = pd.read_csv(data_url, sep="\\s+", skiprows=22, header=None)
+                data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
+                target = raw_df.values[1::2, 2]
+
+            Alternative datasets include the California housing dataset and the
+            Ames housing dataset. You can load the datasets as follows::
+
+                from sklearn.datasets import fetch_california_housing
+                housing = fetch_california_housing()
+
+            for the California housing dataset and::
+
+                from sklearn.datasets import fetch_openml
+                housing = fetch_openml(name="house_prices", as_frame=True)
+
+            for the Ames housing dataset.
+
+            [1] M Carlisle.
+            "Racist data destruction?"
+            <https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>
+
+            [2] Harrison Jr, David, and Daniel L. Rubinfeld.
+            "Hedonic housing prices and the demand for clean air."
+            Journal of environmental economics and management 5.1 (1978): 81-102.
+            <https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>
+            """
+        )
+        raise ImportError(msg)
+    try:
+        return globals()[name]
+    except KeyError:
+        # This is turned into the appropriate ImportError
+        raise AttributeError