some new features
This commit is contained in:
426
.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py
Normal file
426
.venv/lib/python3.12/site-packages/sklearn/datasets/_kddcup99.py
Normal file
@ -0,0 +1,426 @@
|
||||
"""KDDCUP 99 dataset.
|
||||
|
||||
A classic dataset for anomaly detection.
|
||||
|
||||
The dataset page is available from UCI Machine Learning Repository
|
||||
|
||||
https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz
|
||||
|
||||
"""
|
||||
|
||||
import errno
|
||||
import logging
|
||||
import os
|
||||
from gzip import GzipFile
|
||||
from numbers import Integral, Real
|
||||
from os.path import exists, join
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
|
||||
from ..utils import Bunch, check_random_state
|
||||
from ..utils import shuffle as shuffle_method
|
||||
from ..utils._param_validation import Interval, StrOptions, validate_params
|
||||
from . import get_data_home
|
||||
from ._base import (
|
||||
RemoteFileMetadata,
|
||||
_convert_data_dataframe,
|
||||
_fetch_remote,
|
||||
load_descr,
|
||||
)
|
||||
|
||||
# The original data can be found at:
|
||||
# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data.gz
|
||||
ARCHIVE = RemoteFileMetadata(
|
||||
filename="kddcup99_data",
|
||||
url="https://ndownloader.figshare.com/files/5976045",
|
||||
checksum="3b6c942aa0356c0ca35b7b595a26c89d343652c9db428893e7494f837b274292",
|
||||
)
|
||||
|
||||
# The original data can be found at:
|
||||
# https://archive.ics.uci.edu/ml/machine-learning-databases/kddcup99-mld/kddcup.data_10_percent.gz
|
||||
ARCHIVE_10_PERCENT = RemoteFileMetadata(
|
||||
filename="kddcup99_10_data",
|
||||
url="https://ndownloader.figshare.com/files/5976042",
|
||||
checksum="8045aca0d84e70e622d1148d7df782496f6333bf6eb979a1b0837c42a9fd9561",
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@validate_params(
|
||||
{
|
||||
"subset": [StrOptions({"SA", "SF", "http", "smtp"}), None],
|
||||
"data_home": [str, os.PathLike, None],
|
||||
"shuffle": ["boolean"],
|
||||
"random_state": ["random_state"],
|
||||
"percent10": ["boolean"],
|
||||
"download_if_missing": ["boolean"],
|
||||
"return_X_y": ["boolean"],
|
||||
"as_frame": ["boolean"],
|
||||
"n_retries": [Interval(Integral, 1, None, closed="left")],
|
||||
"delay": [Interval(Real, 0.0, None, closed="neither")],
|
||||
},
|
||||
prefer_skip_nested_validation=True,
|
||||
)
|
||||
def fetch_kddcup99(
|
||||
*,
|
||||
subset=None,
|
||||
data_home=None,
|
||||
shuffle=False,
|
||||
random_state=None,
|
||||
percent10=True,
|
||||
download_if_missing=True,
|
||||
return_X_y=False,
|
||||
as_frame=False,
|
||||
n_retries=3,
|
||||
delay=1.0,
|
||||
):
|
||||
"""Load the kddcup99 dataset (classification).
|
||||
|
||||
Download it if necessary.
|
||||
|
||||
================= ====================================
|
||||
Classes 23
|
||||
Samples total 4898431
|
||||
Dimensionality 41
|
||||
Features discrete (int) or continuous (float)
|
||||
================= ====================================
|
||||
|
||||
Read more in the :ref:`User Guide <kddcup99_dataset>`.
|
||||
|
||||
.. versionadded:: 0.18
|
||||
|
||||
Parameters
|
||||
----------
|
||||
subset : {'SA', 'SF', 'http', 'smtp'}, default=None
|
||||
To return the corresponding classical subsets of kddcup 99.
|
||||
If None, return the entire kddcup 99 dataset.
|
||||
|
||||
data_home : str or path-like, default=None
|
||||
Specify another download and cache folder for the datasets. By default
|
||||
all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
|
||||
|
||||
.. versionadded:: 0.19
|
||||
|
||||
shuffle : bool, default=False
|
||||
Whether to shuffle dataset.
|
||||
|
||||
random_state : int, RandomState instance or None, default=None
|
||||
Determines random number generation for dataset shuffling and for
|
||||
selection of abnormal samples if `subset='SA'`. Pass an int for
|
||||
reproducible output across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
percent10 : bool, default=True
|
||||
Whether to load only 10 percent of the data.
|
||||
|
||||
download_if_missing : bool, default=True
|
||||
If False, raise an OSError if the data is not locally available
|
||||
instead of trying to download the data from the source site.
|
||||
|
||||
return_X_y : bool, default=False
|
||||
If True, returns ``(data, target)`` instead of a Bunch object. See
|
||||
below for more information about the `data` and `target` object.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
as_frame : bool, default=False
|
||||
If `True`, returns a pandas Dataframe for the ``data`` and ``target``
|
||||
objects in the `Bunch` returned object; `Bunch` return object will also
|
||||
have a ``frame`` member.
|
||||
|
||||
.. versionadded:: 0.24
|
||||
|
||||
n_retries : int, default=3
|
||||
Number of retries when HTTP errors are encountered.
|
||||
|
||||
.. versionadded:: 1.5
|
||||
|
||||
delay : float, default=1.0
|
||||
Number of seconds between retries.
|
||||
|
||||
.. versionadded:: 1.5
|
||||
|
||||
Returns
|
||||
-------
|
||||
data : :class:`~sklearn.utils.Bunch`
|
||||
Dictionary-like object, with the following attributes.
|
||||
|
||||
data : {ndarray, dataframe} of shape (494021, 41)
|
||||
The data matrix to learn. If `as_frame=True`, `data` will be a
|
||||
pandas DataFrame.
|
||||
target : {ndarray, series} of shape (494021,)
|
||||
The regression target for each sample. If `as_frame=True`, `target`
|
||||
will be a pandas Series.
|
||||
frame : dataframe of shape (494021, 42)
|
||||
Only present when `as_frame=True`. Contains `data` and `target`.
|
||||
DESCR : str
|
||||
The full description of the dataset.
|
||||
feature_names : list
|
||||
The names of the dataset columns
|
||||
target_names: list
|
||||
The names of the target columns
|
||||
|
||||
(data, target) : tuple if ``return_X_y`` is True
|
||||
A tuple of two ndarray. The first containing a 2D array of
|
||||
shape (n_samples, n_features) with each row representing one
|
||||
sample and each column representing the features. The second
|
||||
ndarray of shape (n_samples,) containing the target samples.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
"""
|
||||
data_home = get_data_home(data_home=data_home)
|
||||
kddcup99 = _fetch_brute_kddcup99(
|
||||
data_home=data_home,
|
||||
percent10=percent10,
|
||||
download_if_missing=download_if_missing,
|
||||
n_retries=n_retries,
|
||||
delay=delay,
|
||||
)
|
||||
|
||||
data = kddcup99.data
|
||||
target = kddcup99.target
|
||||
feature_names = kddcup99.feature_names
|
||||
target_names = kddcup99.target_names
|
||||
|
||||
if subset == "SA":
|
||||
s = target == b"normal."
|
||||
t = np.logical_not(s)
|
||||
normal_samples = data[s, :]
|
||||
normal_targets = target[s]
|
||||
abnormal_samples = data[t, :]
|
||||
abnormal_targets = target[t]
|
||||
|
||||
n_samples_abnormal = abnormal_samples.shape[0]
|
||||
# selected abnormal samples:
|
||||
random_state = check_random_state(random_state)
|
||||
r = random_state.randint(0, n_samples_abnormal, 3377)
|
||||
abnormal_samples = abnormal_samples[r]
|
||||
abnormal_targets = abnormal_targets[r]
|
||||
|
||||
data = np.r_[normal_samples, abnormal_samples]
|
||||
target = np.r_[normal_targets, abnormal_targets]
|
||||
|
||||
if subset == "SF" or subset == "http" or subset == "smtp":
|
||||
# select all samples with positive logged_in attribute:
|
||||
s = data[:, 11] == 1
|
||||
data = np.c_[data[s, :11], data[s, 12:]]
|
||||
feature_names = feature_names[:11] + feature_names[12:]
|
||||
target = target[s]
|
||||
|
||||
data[:, 0] = np.log((data[:, 0] + 0.1).astype(float, copy=False))
|
||||
data[:, 4] = np.log((data[:, 4] + 0.1).astype(float, copy=False))
|
||||
data[:, 5] = np.log((data[:, 5] + 0.1).astype(float, copy=False))
|
||||
|
||||
if subset == "http":
|
||||
s = data[:, 2] == b"http"
|
||||
data = data[s]
|
||||
target = target[s]
|
||||
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
|
||||
feature_names = [feature_names[0], feature_names[4], feature_names[5]]
|
||||
|
||||
if subset == "smtp":
|
||||
s = data[:, 2] == b"smtp"
|
||||
data = data[s]
|
||||
target = target[s]
|
||||
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
|
||||
feature_names = [feature_names[0], feature_names[4], feature_names[5]]
|
||||
|
||||
if subset == "SF":
|
||||
data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]]
|
||||
feature_names = [
|
||||
feature_names[0],
|
||||
feature_names[2],
|
||||
feature_names[4],
|
||||
feature_names[5],
|
||||
]
|
||||
|
||||
if shuffle:
|
||||
data, target = shuffle_method(data, target, random_state=random_state)
|
||||
|
||||
fdescr = load_descr("kddcup99.rst")
|
||||
|
||||
frame = None
|
||||
if as_frame:
|
||||
frame, data, target = _convert_data_dataframe(
|
||||
"fetch_kddcup99", data, target, feature_names, target_names
|
||||
)
|
||||
|
||||
if return_X_y:
|
||||
return data, target
|
||||
|
||||
return Bunch(
|
||||
data=data,
|
||||
target=target,
|
||||
frame=frame,
|
||||
target_names=target_names,
|
||||
feature_names=feature_names,
|
||||
DESCR=fdescr,
|
||||
)
|
||||
|
||||
|
||||
def _fetch_brute_kddcup99(
|
||||
data_home=None, download_if_missing=True, percent10=True, n_retries=3, delay=1.0
|
||||
):
|
||||
"""Load the kddcup99 dataset, downloading it if necessary.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data_home : str, default=None
|
||||
Specify another download and cache folder for the datasets. By default
|
||||
all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
|
||||
|
||||
download_if_missing : bool, default=True
|
||||
If False, raise an OSError if the data is not locally available
|
||||
instead of trying to download the data from the source site.
|
||||
|
||||
percent10 : bool, default=True
|
||||
Whether to load only 10 percent of the data.
|
||||
|
||||
n_retries : int, default=3
|
||||
Number of retries when HTTP errors are encountered.
|
||||
|
||||
delay : float, default=1.0
|
||||
Number of seconds between retries.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dataset : :class:`~sklearn.utils.Bunch`
|
||||
Dictionary-like object, with the following attributes.
|
||||
|
||||
data : ndarray of shape (494021, 41)
|
||||
Each row corresponds to the 41 features in the dataset.
|
||||
target : ndarray of shape (494021,)
|
||||
Each value corresponds to one of the 21 attack types or to the
|
||||
label 'normal.'.
|
||||
feature_names : list
|
||||
The names of the dataset columns
|
||||
target_names: list
|
||||
The names of the target columns
|
||||
DESCR : str
|
||||
Description of the kddcup99 dataset.
|
||||
|
||||
"""
|
||||
|
||||
data_home = get_data_home(data_home=data_home)
|
||||
dir_suffix = "-py3"
|
||||
|
||||
if percent10:
|
||||
kddcup_dir = join(data_home, "kddcup99_10" + dir_suffix)
|
||||
archive = ARCHIVE_10_PERCENT
|
||||
else:
|
||||
kddcup_dir = join(data_home, "kddcup99" + dir_suffix)
|
||||
archive = ARCHIVE
|
||||
|
||||
samples_path = join(kddcup_dir, "samples")
|
||||
targets_path = join(kddcup_dir, "targets")
|
||||
available = exists(samples_path)
|
||||
|
||||
dt = [
|
||||
("duration", int),
|
||||
("protocol_type", "S4"),
|
||||
("service", "S11"),
|
||||
("flag", "S6"),
|
||||
("src_bytes", int),
|
||||
("dst_bytes", int),
|
||||
("land", int),
|
||||
("wrong_fragment", int),
|
||||
("urgent", int),
|
||||
("hot", int),
|
||||
("num_failed_logins", int),
|
||||
("logged_in", int),
|
||||
("num_compromised", int),
|
||||
("root_shell", int),
|
||||
("su_attempted", int),
|
||||
("num_root", int),
|
||||
("num_file_creations", int),
|
||||
("num_shells", int),
|
||||
("num_access_files", int),
|
||||
("num_outbound_cmds", int),
|
||||
("is_host_login", int),
|
||||
("is_guest_login", int),
|
||||
("count", int),
|
||||
("srv_count", int),
|
||||
("serror_rate", float),
|
||||
("srv_serror_rate", float),
|
||||
("rerror_rate", float),
|
||||
("srv_rerror_rate", float),
|
||||
("same_srv_rate", float),
|
||||
("diff_srv_rate", float),
|
||||
("srv_diff_host_rate", float),
|
||||
("dst_host_count", int),
|
||||
("dst_host_srv_count", int),
|
||||
("dst_host_same_srv_rate", float),
|
||||
("dst_host_diff_srv_rate", float),
|
||||
("dst_host_same_src_port_rate", float),
|
||||
("dst_host_srv_diff_host_rate", float),
|
||||
("dst_host_serror_rate", float),
|
||||
("dst_host_srv_serror_rate", float),
|
||||
("dst_host_rerror_rate", float),
|
||||
("dst_host_srv_rerror_rate", float),
|
||||
("labels", "S16"),
|
||||
]
|
||||
|
||||
column_names = [c[0] for c in dt]
|
||||
target_names = column_names[-1]
|
||||
feature_names = column_names[:-1]
|
||||
|
||||
if available:
|
||||
try:
|
||||
X = joblib.load(samples_path)
|
||||
y = joblib.load(targets_path)
|
||||
except Exception as e:
|
||||
raise OSError(
|
||||
"The cache for fetch_kddcup99 is invalid, please delete "
|
||||
f"{str(kddcup_dir)} and run the fetch_kddcup99 again"
|
||||
) from e
|
||||
|
||||
elif download_if_missing:
|
||||
_mkdirp(kddcup_dir)
|
||||
logger.info("Downloading %s" % archive.url)
|
||||
_fetch_remote(archive, dirname=kddcup_dir, n_retries=n_retries, delay=delay)
|
||||
DT = np.dtype(dt)
|
||||
logger.debug("extracting archive")
|
||||
archive_path = join(kddcup_dir, archive.filename)
|
||||
file_ = GzipFile(filename=archive_path, mode="r")
|
||||
Xy = []
|
||||
for line in file_.readlines():
|
||||
line = line.decode()
|
||||
Xy.append(line.replace("\n", "").split(","))
|
||||
file_.close()
|
||||
logger.debug("extraction done")
|
||||
os.remove(archive_path)
|
||||
|
||||
Xy = np.asarray(Xy, dtype=object)
|
||||
for j in range(42):
|
||||
Xy[:, j] = Xy[:, j].astype(DT[j])
|
||||
|
||||
X = Xy[:, :-1]
|
||||
y = Xy[:, -1]
|
||||
# XXX bug when compress!=0:
|
||||
# (error: 'Incorrect data length while decompressing[...] the file
|
||||
# could be corrupted.')
|
||||
|
||||
joblib.dump(X, samples_path, compress=0)
|
||||
joblib.dump(y, targets_path, compress=0)
|
||||
else:
|
||||
raise OSError("Data not found and `download_if_missing` is False")
|
||||
|
||||
return Bunch(
|
||||
data=X,
|
||||
target=y,
|
||||
feature_names=feature_names,
|
||||
target_names=[target_names],
|
||||
)
|
||||
|
||||
|
||||
def _mkdirp(d):
|
||||
"""Ensure directory d exists (like mkdir -p on Unix)
|
||||
No guarantee that the directory is writable.
|
||||
"""
|
||||
try:
|
||||
os.makedirs(d)
|
||||
except OSError as e:
|
||||
if e.errno != errno.EEXIST:
|
||||
raise
|
||||
Reference in New Issue
Block a user