reconnect moved files to git repo
venv/lib/python3.11/site-packages/sklearn/decomposition/_lda.py (new file, 927 lines)
@@ -0,0 +1,927 @@
"""
=============================================================
Online Latent Dirichlet Allocation with variational inference
=============================================================

This implementation is modified from Matthew D. Hoffman's onlineldavb code
Link: https://github.com/blei-lab/onlineldavb
"""

# Author: Chyi-Kwei Yau
# Author: Matthew D. Hoffman (original onlineldavb implementation)
from numbers import Integral, Real

import numpy as np
import scipy.sparse as sp
from joblib import effective_n_jobs
from scipy.special import gammaln, logsumexp

from ..base import (
    BaseEstimator,
    ClassNamePrefixFeaturesOutMixin,
    TransformerMixin,
    _fit_context,
)
from ..utils import check_random_state, gen_batches, gen_even_slices
from ..utils._param_validation import Interval, StrOptions
from ..utils.parallel import Parallel, delayed
from ..utils.validation import check_is_fitted, check_non_negative
from ._online_lda_fast import (
    _dirichlet_expectation_1d as cy_dirichlet_expectation_1d,
)
from ._online_lda_fast import (
    _dirichlet_expectation_2d,
)
from ._online_lda_fast import (
    mean_change as cy_mean_change,
)

EPS = np.finfo(float).eps

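# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream module): the estimator
# defined below expects a matrix of token counts, e.g. the output of
# CountVectorizer; the document and parameter values here are made up.
#
#     from sklearn.feature_extraction.text import CountVectorizer
#     from sklearn.decomposition import LatentDirichletAllocation
#
#     docs = ["apple banana apple", "banana fruit salad", "dog cat dog"]
#     X = CountVectorizer().fit_transform(docs)
#     lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)
#     doc_topics = lda.transform(X)   # shape (n_docs, 2), rows sum to 1
# ---------------------------------------------------------------------------
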
def _update_doc_distribution(
    X,
    exp_topic_word_distr,
    doc_topic_prior,
    max_doc_update_iter,
    mean_change_tol,
    cal_sstats,
    random_state,
):
    """E-step: update document-topic distribution.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Document word matrix.

    exp_topic_word_distr : ndarray of shape (n_topics, n_features)
        Exponential value of expectation of log topic word distribution.
        In the literature, this is `exp(E[log(beta)])`.

    doc_topic_prior : float
        Prior of document topic distribution `theta`.

    max_doc_update_iter : int
        Max number of iterations for updating document topic distribution in
        the E-step.

    mean_change_tol : float
        Stopping tolerance for updating document topic distribution in E-step.

    cal_sstats : bool
        Whether to calculate sufficient statistics. Set `cal_sstats` to `True`
        when the M-step needs to be run afterwards.

    random_state : RandomState instance or None
        Controls how the document topic distribution is initialized. Setting
        `random_state` to None initializes the document topic distribution to
        a constant value.

    Returns
    -------
    (doc_topic_distr, suff_stats) :
        `doc_topic_distr` is the unnormalized topic distribution for each
        document. In the literature, this is `gamma`. `E[log(theta)]` can be
        calculated from it.
        `suff_stats` is the expected sufficient statistics for the M-step.
        When `cal_sstats == False`, this will be None.

    """
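    # Descriptive note (summary of the variational E-step from Hoffman et al.,
    # "Online Learning for Latent Dirichlet Allocation"): for each document d,
    # the per-word topic responsibilities are
    #     phi_{dwk} proportional to exp(E[log theta_{dk}]) * exp(E[log beta_{kw}]),
    # and the variational parameter of q(theta_d) is updated as
    #     gamma_{dk} = alpha + sum_w n_{dw} * phi_{dwk}.
    # This is exactly the `doc_topic_d` update in the inner loop below, with
    # the normalizer of phi kept in `norm_phi`.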
    is_sparse_x = sp.issparse(X)
    n_samples, n_features = X.shape
    n_topics = exp_topic_word_distr.shape[0]

    if random_state:
        doc_topic_distr = random_state.gamma(100.0, 0.01, (n_samples, n_topics)).astype(
            X.dtype, copy=False
        )
    else:
        doc_topic_distr = np.ones((n_samples, n_topics), dtype=X.dtype)

    # In the literature, this is `exp(E[log(theta)])`
    exp_doc_topic = np.exp(_dirichlet_expectation_2d(doc_topic_distr))

    # Expected sufficient statistics (only allocated when `cal_sstats` is True)
    suff_stats = (
        np.zeros(exp_topic_word_distr.shape, dtype=X.dtype) if cal_sstats else None
    )

    if is_sparse_x:
        X_data = X.data
        X_indices = X.indices
        X_indptr = X.indptr

    # These cython functions are called in a nested loop on usually very small arrays
    # (length=n_topics). In that case, finding the appropriate signature of the
    # fused-typed function can be more costly than its execution, hence the dispatch
    # is done outside of the loop.
    ctype = "float" if X.dtype == np.float32 else "double"
    mean_change = cy_mean_change[ctype]
    dirichlet_expectation_1d = cy_dirichlet_expectation_1d[ctype]
    eps = np.finfo(X.dtype).eps

    for idx_d in range(n_samples):
        if is_sparse_x:
            ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]]
            cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]]
        else:
            ids = np.nonzero(X[idx_d, :])[0]
            cnts = X[idx_d, ids]

        doc_topic_d = doc_topic_distr[idx_d, :]
        # The next one is a copy, since the inner loop overwrites it.
        exp_doc_topic_d = exp_doc_topic[idx_d, :].copy()
        exp_topic_word_d = exp_topic_word_distr[:, ids]

        # Iterate between `doc_topic_d` and `norm_phi` until convergence
        for _ in range(0, max_doc_update_iter):
            last_d = doc_topic_d

            # The optimal phi_{dwk} is proportional to
            # exp(E[log(theta_{dk})]) * exp(E[log(beta_{dw})]).
            norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps

            doc_topic_d = exp_doc_topic_d * np.dot(cnts / norm_phi, exp_topic_word_d.T)
            # Note: adds doc_topic_prior to doc_topic_d, in-place.
            dirichlet_expectation_1d(doc_topic_d, doc_topic_prior, exp_doc_topic_d)

            if mean_change(last_d, doc_topic_d) < mean_change_tol:
                break
        doc_topic_distr[idx_d, :] = doc_topic_d

        # Contribution of document d to the expected sufficient
        # statistics for the M step.
        if cal_sstats:
            norm_phi = np.dot(exp_doc_topic_d, exp_topic_word_d) + eps
            suff_stats[:, ids] += np.outer(exp_doc_topic_d, cnts / norm_phi)

    return (doc_topic_distr, suff_stats)


class LatentDirichletAllocation(
    ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator
):
    """Latent Dirichlet Allocation with online variational Bayes algorithm.

    The implementation is based on [1]_ and [2]_.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <LatentDirichletAllocation>`.

    Parameters
    ----------
    n_components : int, default=10
        Number of topics.

        .. versionchanged:: 0.19
            ``n_topics`` was renamed to ``n_components``.

    doc_topic_prior : float, default=None
        Prior of document topic distribution `theta`. If the value is None,
        defaults to `1 / n_components`.
        In [1]_, this is called `alpha`.

    topic_word_prior : float, default=None
        Prior of topic word distribution `beta`. If the value is None, defaults
        to `1 / n_components`.
        In [1]_, this is called `eta`.

    learning_method : {'batch', 'online'}, default='batch'
        Method used to update ``components_``. Only used in the :meth:`fit`
        method. In general, if the data size is large, the online update will
        be much faster than the batch update.

        Valid options:

        - 'batch': Batch variational Bayes method. Use all training data in each EM
          update. Old `components_` will be overwritten in each iteration.
        - 'online': Online variational Bayes method. In each EM update, use a
          mini-batch of training data to update the ``components_`` variable
          incrementally. The learning rate is controlled by the
          ``learning_decay`` and the ``learning_offset`` parameters.

        .. versionchanged:: 0.20
            The default learning method is now ``"batch"``.

    learning_decay : float, default=0.7
        Parameter that controls the learning rate in the online learning
        method. The value should be set between (0.5, 1.0] to guarantee
        asymptotic convergence. When the value is 0.0 and batch_size is
        ``n_samples``, the update method is the same as batch learning. In the
        literature, this is called kappa.

    learning_offset : float, default=10.0
        A (positive) parameter that downweights early iterations in online
        learning. It should be greater than 1.0. In the literature, this is
        called tau_0.

    max_iter : int, default=10
        The maximum number of passes over the training data (aka epochs).
        It only impacts the behavior in the :meth:`fit` method, and not the
        :meth:`partial_fit` method.

    batch_size : int, default=128
        Number of documents to use in each EM iteration. Only used in online
        learning.

    evaluate_every : int, default=-1
        How often to evaluate perplexity. Only used in the `fit` method.
        Set it to 0 or a negative number to not evaluate perplexity during
        training at all. Evaluating perplexity can help you check convergence
        during training, but it will also increase total training time.
        Evaluating perplexity in every iteration might increase training time
        up to two-fold.

    total_samples : int, default=1e6
        Total number of documents. Only used in the :meth:`partial_fit` method.

    perp_tol : float, default=1e-1
        Perplexity tolerance. Only used when ``evaluate_every`` is greater than 0.

    mean_change_tol : float, default=1e-3
        Stopping tolerance for updating document topic distribution in E-step.

    max_doc_update_iter : int, default=100
        Max number of iterations for updating document topic distribution in
        the E-step.

    n_jobs : int, default=None
        The number of jobs to use in the E-step.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        Verbosity level.

    random_state : int, RandomState instance or None, default=None
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    components_ : ndarray of shape (n_components, n_features)
        Variational parameters for topic word distribution. Since the complete
        conditional for topic word distribution is a Dirichlet,
        ``components_[i, j]`` can be viewed as a pseudocount that represents the
        number of times word `j` was assigned to topic `i`.
        It can also be viewed as the distribution over the words for each topic
        after normalization:
        ``model.components_ / model.components_.sum(axis=1)[:, np.newaxis]``.

    exp_dirichlet_component_ : ndarray of shape (n_components, n_features)
        Exponential value of expectation of log topic word distribution.
        In the literature, this is `exp(E[log(beta)])`.

    n_batch_iter_ : int
        Number of iterations of the EM step.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        Number of passes over the dataset.

    bound_ : float
        Final perplexity score on training set.

    doc_topic_prior_ : float
        Prior of document topic distribution `theta`. If the value is None,
        it is `1 / n_components`.

    random_state_ : RandomState instance
        RandomState instance that is generated either from a seed, the random
        number generator or by `np.random`.

    topic_word_prior_ : float
        Prior of topic word distribution `beta`. If the value is None, it is
        `1 / n_components`.

    See Also
    --------
    sklearn.discriminant_analysis.LinearDiscriminantAnalysis:
        A classifier with a linear decision boundary, generated by fitting
        class conditional densities to the data and using Bayes' rule.

    References
    ----------
    .. [1] "Online Learning for Latent Dirichlet Allocation", Matthew D.
        Hoffman, David M. Blei, Francis Bach, 2010
        https://github.com/blei-lab/onlineldavb

    .. [2] "Stochastic Variational Inference", Matthew D. Hoffman,
        David M. Blei, Chong Wang, John Paisley, 2013

    Examples
    --------
    >>> from sklearn.decomposition import LatentDirichletAllocation
    >>> from sklearn.datasets import make_multilabel_classification
    >>> # This produces a feature matrix of token counts, similar to what
    >>> # CountVectorizer would produce on text.
    >>> X, _ = make_multilabel_classification(random_state=0)
    >>> lda = LatentDirichletAllocation(n_components=5,
    ...     random_state=0)
    >>> lda.fit(X)
    LatentDirichletAllocation(...)
    >>> # get topics for some given samples:
    >>> lda.transform(X[-2:])
    array([[0.00360392, 0.25499205, 0.0036211 , 0.64236448, 0.09541846],
           [0.15297572, 0.00362644, 0.44412786, 0.39568399, 0.003586 ]])
    """

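    # Illustrative note (not part of the upstream class): after fitting on the
    # output of a vectorizer, the most probable words per topic can be read
    # off ``components_``, e.g.
    #
    #     feature_names = vectorizer.get_feature_names_out()
    #     for k, topic in enumerate(lda.components_):
    #         top = topic.argsort()[-10:][::-1]
    #         print(k, [feature_names[i] for i in top])
    #
    # where ``vectorizer`` and ``lda`` are assumed to be a fitted
    # CountVectorizer and LatentDirichletAllocation.
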
    _parameter_constraints: dict = {
        "n_components": [Interval(Integral, 0, None, closed="neither")],
        "doc_topic_prior": [None, Interval(Real, 0, 1, closed="both")],
        "topic_word_prior": [None, Interval(Real, 0, 1, closed="both")],
        "learning_method": [StrOptions({"batch", "online"})],
        "learning_decay": [Interval(Real, 0, 1, closed="both")],
        "learning_offset": [Interval(Real, 1.0, None, closed="left")],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
        "batch_size": [Interval(Integral, 0, None, closed="neither")],
        "evaluate_every": [Interval(Integral, None, None, closed="neither")],
        "total_samples": [Interval(Real, 0, None, closed="neither")],
        "perp_tol": [Interval(Real, 0, None, closed="left")],
        "mean_change_tol": [Interval(Real, 0, None, closed="left")],
        "max_doc_update_iter": [Interval(Integral, 0, None, closed="left")],
        "n_jobs": [None, Integral],
        "verbose": ["verbose"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_components=10,
        *,
        doc_topic_prior=None,
        topic_word_prior=None,
        learning_method="batch",
        learning_decay=0.7,
        learning_offset=10.0,
        max_iter=10,
        batch_size=128,
        evaluate_every=-1,
        total_samples=1e6,
        perp_tol=1e-1,
        mean_change_tol=1e-3,
        max_doc_update_iter=100,
        n_jobs=None,
        verbose=0,
        random_state=None,
    ):
        self.n_components = n_components
        self.doc_topic_prior = doc_topic_prior
        self.topic_word_prior = topic_word_prior
        self.learning_method = learning_method
        self.learning_decay = learning_decay
        self.learning_offset = learning_offset
        self.max_iter = max_iter
        self.batch_size = batch_size
        self.evaluate_every = evaluate_every
        self.total_samples = total_samples
        self.perp_tol = perp_tol
        self.mean_change_tol = mean_change_tol
        self.max_doc_update_iter = max_doc_update_iter
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.random_state = random_state

    def _init_latent_vars(self, n_features, dtype=np.float64):
        """Initialize latent variables."""

        self.random_state_ = check_random_state(self.random_state)
        self.n_batch_iter_ = 1
        self.n_iter_ = 0

        if self.doc_topic_prior is None:
            self.doc_topic_prior_ = 1.0 / self.n_components
        else:
            self.doc_topic_prior_ = self.doc_topic_prior

        if self.topic_word_prior is None:
            self.topic_word_prior_ = 1.0 / self.n_components
        else:
            self.topic_word_prior_ = self.topic_word_prior

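        # Descriptive note: `components_` holds the variational parameter
        # `lambda` of q(beta). Drawing it from Gamma(100, 1/100) (mean 1.0,
        # small variance) mirrors the initialization used in Hoffman's
        # onlineldavb reference implementation.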
        init_gamma = 100.0
        init_var = 1.0 / init_gamma
        # In the literature, this is called `lambda`
        self.components_ = self.random_state_.gamma(
            init_gamma, init_var, (self.n_components, n_features)
        ).astype(dtype, copy=False)

        # In the literature, this is `exp(E[log(beta)])`
        self.exp_dirichlet_component_ = np.exp(
            _dirichlet_expectation_2d(self.components_)
        )

    def _e_step(self, X, cal_sstats, random_init, parallel=None):
        """E-step in EM update.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        cal_sstats : bool
            Whether to calculate sufficient statistics. Set ``cal_sstats`` to
            True when the M-step needs to be run afterwards.

        random_init : bool
            Whether to initialize the document topic distribution randomly in
            the E-step. Set it to True in training steps.

        parallel : joblib.Parallel, default=None
            Pre-initialized instance of joblib.Parallel.

        Returns
        -------
        (doc_topic_distr, suff_stats) :
            `doc_topic_distr` is the unnormalized topic distribution for each
            document. In the literature, this is called `gamma`.
            `suff_stats` is the expected sufficient statistics for the M-step.
            When `cal_sstats == False`, it will be None.

        """

        # Run e-step in parallel
        random_state = self.random_state_ if random_init else None

        # TODO: make Parallel._effective_n_jobs public instead?
        n_jobs = effective_n_jobs(self.n_jobs)
        if parallel is None:
            parallel = Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1))
        results = parallel(
            delayed(_update_doc_distribution)(
                X[idx_slice, :],
                self.exp_dirichlet_component_,
                self.doc_topic_prior_,
                self.max_doc_update_iter,
                self.mean_change_tol,
                cal_sstats,
                random_state,
            )
            for idx_slice in gen_even_slices(X.shape[0], n_jobs)
        )

        # merge result
        doc_topics, sstats_list = zip(*results)
        doc_topic_distr = np.vstack(doc_topics)

        if cal_sstats:
            # This step finishes computing the sufficient statistics for the
            # M-step.
            suff_stats = np.zeros(self.components_.shape, dtype=self.components_.dtype)
            for sstats in sstats_list:
                suff_stats += sstats
            suff_stats *= self.exp_dirichlet_component_
        else:
            suff_stats = None

        return (doc_topic_distr, suff_stats)

    def _em_step(self, X, total_samples, batch_update, parallel=None):
        """EM update for 1 iteration.

        Update `components_` by batch VB or online VB.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        total_samples : int
            Total number of documents. It is only used when
            batch_update is `False`.

        batch_update : bool
            Parameter that controls the updating method.
            `True` for batch learning, `False` for online learning.

        parallel : joblib.Parallel, default=None
            Pre-initialized instance of joblib.Parallel.

        Returns
        -------
        doc_topic_distr : ndarray of shape (n_samples, n_components)
            Unnormalized document topic distribution.
        """

        # E-step
        _, suff_stats = self._e_step(
            X, cal_sstats=True, random_init=True, parallel=parallel
        )

        # M-step
        if batch_update:
            self.components_ = self.topic_word_prior_ + suff_stats
        else:
            # online update
            # In the literature, the weight is `rho`
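            # Descriptive note: this is the stochastic natural-gradient update
            # of Hoffman et al. (2010). With
            #     rho_t = (learning_offset + t) ** (-learning_decay),
            # the update below computes
            #     lambda <- (1 - rho_t) * lambda
            #               + rho_t * (eta + (D / |batch|) * suff_stats),
            # where D is `total_samples` and lambda is `components_`.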
            weight = np.power(
                self.learning_offset + self.n_batch_iter_, -self.learning_decay
            )
            doc_ratio = float(total_samples) / X.shape[0]
            self.components_ *= 1 - weight
            self.components_ += weight * (
                self.topic_word_prior_ + doc_ratio * suff_stats
            )

        # update `components_` related variables
        self.exp_dirichlet_component_ = np.exp(
            _dirichlet_expectation_2d(self.components_)
        )
        self.n_batch_iter_ += 1
        return

    def _more_tags(self):
        return {
            "preserves_dtype": [np.float64, np.float32],
            "requires_positive_X": True,
        }

    def _check_non_neg_array(self, X, reset_n_features, whom):
        """Check X format.

        Check X format and make sure no negative value is in X.

        Parameters
        ----------
        X : array-like or sparse matrix

        """
        dtype = [np.float64, np.float32] if reset_n_features else self.components_.dtype

        X = self._validate_data(
            X,
            reset=reset_n_features,
            accept_sparse="csr",
            dtype=dtype,
        )
        check_non_negative(X, whom)

        return X

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None):
        """Online VB with Mini-Batch update.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Partially fitted estimator.
        """
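        # Illustrative usage sketch (not part of the upstream docstring): for a
        # corpus that does not fit in memory, `partial_fit` can be called once
        # per chunk of a CountVectorizer-style count matrix, e.g.
        #
        #     lda = LatentDirichletAllocation(n_components=20, total_samples=1e6)
        #     for X_chunk in stream_of_count_matrices:   # hypothetical iterator
        #         lda.partial_fit(X_chunk)
        #
        # `total_samples` should approximate the full corpus size so the online
        # update is scaled correctly.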
        first_time = not hasattr(self, "components_")

        X = self._check_non_neg_array(
            X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit"
        )
        n_samples, n_features = X.shape
        batch_size = self.batch_size

        # initialize parameters or check
        if first_time:
            self._init_latent_vars(n_features, dtype=X.dtype)

        if n_features != self.components_.shape[1]:
            raise ValueError(
                "The provided data has %d dimensions while "
                "the model was trained with feature size %d."
                % (n_features, self.components_.shape[1])
            )

        n_jobs = effective_n_jobs(self.n_jobs)
        with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:
            for idx_slice in gen_batches(n_samples, batch_size):
                self._em_step(
                    X[idx_slice, :],
                    total_samples=self.total_samples,
                    batch_update=False,
                    parallel=parallel,
                )

        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Learn model for the data X with variational Bayes method.

        When `learning_method` is 'online', use mini-batch update.
        Otherwise, use batch update.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self
            Fitted estimator.
        """
        X = self._check_non_neg_array(
            X, reset_n_features=True, whom="LatentDirichletAllocation.fit"
        )
        n_samples, n_features = X.shape
        max_iter = self.max_iter
        evaluate_every = self.evaluate_every
        learning_method = self.learning_method

        batch_size = self.batch_size

        # initialize parameters
        self._init_latent_vars(n_features, dtype=X.dtype)
        # change to perplexity later
        last_bound = None
        n_jobs = effective_n_jobs(self.n_jobs)
        with Parallel(n_jobs=n_jobs, verbose=max(0, self.verbose - 1)) as parallel:
            for i in range(max_iter):
                if learning_method == "online":
                    for idx_slice in gen_batches(n_samples, batch_size):
                        self._em_step(
                            X[idx_slice, :],
                            total_samples=n_samples,
                            batch_update=False,
                            parallel=parallel,
                        )
                else:
                    # batch update
                    self._em_step(
                        X, total_samples=n_samples, batch_update=True, parallel=parallel
                    )

                # check perplexity
                if evaluate_every > 0 and (i + 1) % evaluate_every == 0:
                    doc_topics_distr, _ = self._e_step(
                        X, cal_sstats=False, random_init=False, parallel=parallel
                    )
                    bound = self._perplexity_precomp_distr(
                        X, doc_topics_distr, sub_sampling=False
                    )
                    if self.verbose:
                        print(
                            "iteration: %d of max_iter: %d, perplexity: %.4f"
                            % (i + 1, max_iter, bound)
                        )

                    if last_bound and abs(last_bound - bound) < self.perp_tol:
                        break
                    last_bound = bound

                elif self.verbose:
                    print("iteration: %d of max_iter: %d" % (i + 1, max_iter))
                self.n_iter_ += 1

            # calculate final perplexity value on train set
            doc_topics_distr, _ = self._e_step(
                X, cal_sstats=False, random_init=False, parallel=parallel
            )
            self.bound_ = self._perplexity_precomp_distr(
                X, doc_topics_distr, sub_sampling=False
            )

        return self

    def _unnormalized_transform(self, X):
        """Transform data X according to fitted model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        Returns
        -------
        doc_topic_distr : ndarray of shape (n_samples, n_components)
            Document topic distribution for X.
        """
        doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False)

        return doc_topic_distr

    def transform(self, X):
        """Transform data X according to the fitted model.

        .. versionchanged:: 0.18
            *doc_topic_distr* is now normalized.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        Returns
        -------
        doc_topic_distr : ndarray of shape (n_samples, n_components)
            Document topic distribution for X.
        """
        check_is_fitted(self)
        X = self._check_non_neg_array(
            X, reset_n_features=False, whom="LatentDirichletAllocation.transform"
        )
        doc_topic_distr = self._unnormalized_transform(X)
        doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis]
        return doc_topic_distr

    def _approx_bound(self, X, doc_topic_distr, sub_sampling):
        """Estimate the variational bound.

        Estimate the variational bound over "all documents" using only the
        documents passed in as X. Since the log-likelihood of each word cannot
        be computed directly, we use this bound to estimate it.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        doc_topic_distr : ndarray of shape (n_samples, n_components)
            Document topic distribution. In the literature, this is called
            gamma.

        sub_sampling : bool, default=False
            Compensate for the subsampling of documents.
            It is used when calculating the bound in online learning.

        Returns
        -------
        score : float

        """

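        # Descriptive note: the value returned below is the evidence lower
        # bound (ELBO), decomposed into
        #     E[log p(docs | theta, beta)]
        #     + E[log p(theta | alpha)] - E[log q(theta | gamma)]
        #     + E[log p(beta  | eta)]   - E[log q(beta  | lambda)],
        # with the last two pairs computed by the `_loglikelihood` helper.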
        def _loglikelihood(prior, distr, dirichlet_distr, size):
            # calculate log-likelihood
            score = np.sum((prior - distr) * dirichlet_distr)
            score += np.sum(gammaln(distr) - gammaln(prior))
            score += np.sum(gammaln(prior * size) - gammaln(np.sum(distr, 1)))
            return score

        is_sparse_x = sp.issparse(X)
        n_samples, n_components = doc_topic_distr.shape
        n_features = self.components_.shape[1]
        score = 0

        dirichlet_doc_topic = _dirichlet_expectation_2d(doc_topic_distr)
        dirichlet_component_ = _dirichlet_expectation_2d(self.components_)
        doc_topic_prior = self.doc_topic_prior_
        topic_word_prior = self.topic_word_prior_

        if is_sparse_x:
            X_data = X.data
            X_indices = X.indices
            X_indptr = X.indptr

        # E[log p(docs | theta, beta)]
        for idx_d in range(0, n_samples):
            if is_sparse_x:
                ids = X_indices[X_indptr[idx_d] : X_indptr[idx_d + 1]]
                cnts = X_data[X_indptr[idx_d] : X_indptr[idx_d + 1]]
            else:
                ids = np.nonzero(X[idx_d, :])[0]
                cnts = X[idx_d, ids]
            temp = (
                dirichlet_doc_topic[idx_d, :, np.newaxis] + dirichlet_component_[:, ids]
            )
            norm_phi = logsumexp(temp, axis=0)
            score += np.dot(cnts, norm_phi)

        # compute E[log p(theta | alpha) - log q(theta | gamma)]
        score += _loglikelihood(
            doc_topic_prior, doc_topic_distr, dirichlet_doc_topic, self.n_components
        )

        # Compensate for the subsampling of the population of documents
        if sub_sampling:
            doc_ratio = float(self.total_samples) / n_samples
            score *= doc_ratio

        # E[log p(beta | eta) - log q (beta | lambda)]
        score += _loglikelihood(
            topic_word_prior, self.components_, dirichlet_component_, n_features
        )

        return score

    def score(self, X, y=None):
        """Calculate approximate log-likelihood as score.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        score : float
            Use approximate bound as score.
        """
        check_is_fitted(self)
        X = self._check_non_neg_array(
            X, reset_n_features=False, whom="LatentDirichletAllocation.score"
        )

        doc_topic_distr = self._unnormalized_transform(X)
        score = self._approx_bound(X, doc_topic_distr, sub_sampling=False)
        return score

    def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False):
        """Calculate approximate perplexity for data X, with the ability to
        accept a precomputed doc_topic_distr.

        Perplexity is defined as exp(-1. * log-likelihood per word).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        doc_topic_distr : ndarray of shape (n_samples, n_components), \
                default=None
            Document topic distribution.
            If it is None, it will be generated by applying transform on X.

        Returns
        -------
        score : float
            Perplexity score.
        """
        if doc_topic_distr is None:
            doc_topic_distr = self._unnormalized_transform(X)
        else:
            n_samples, n_components = doc_topic_distr.shape
            if n_samples != X.shape[0]:
                raise ValueError(
                    "Number of samples in X and doc_topic_distr do not match."
                )

            if n_components != self.n_components:
                raise ValueError("Number of topics does not match.")

        current_samples = X.shape[0]
        bound = self._approx_bound(X, doc_topic_distr, sub_sampling)

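        # Descriptive note: the bound is normalized by the total word count to
        # obtain a per-word log-likelihood; with `sub_sampling`, the word count
        # is scaled up to the full corpus size so the two quantities stay on
        # the same scale.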
        if sub_sampling:
            word_cnt = X.sum() * (float(self.total_samples) / current_samples)
        else:
            word_cnt = X.sum()
        perword_bound = bound / word_cnt

        return np.exp(-1.0 * perword_bound)

    def perplexity(self, X, sub_sampling=False):
        """Calculate approximate perplexity for data X.

        Perplexity is defined as exp(-1. * log-likelihood per word).

        .. versionchanged:: 0.19
            The *doc_topic_distr* argument has been deprecated and is ignored
            because the user no longer has access to the unnormalized
            distribution.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document word matrix.

        sub_sampling : bool
            Whether to do sub-sampling or not.

        Returns
        -------
        score : float
            Perplexity score.
        """
        check_is_fitted(self)
        X = self._check_non_neg_array(
            X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity"
        )
        return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)

    @property
    def _n_features_out(self):
        """Number of transformed output features."""
        return self.components_.shape[0]