Source code for pycave.bayes.gmm.estimator

from __future__ import annotations
import logging
from typing import Any, cast, List, Tuple
import torch
from lightkit import ConfigurableBaseEstimator
from lightkit.data import collate_tensor, DataLoader, dataset_from_tensors, TensorLike
from lightkit.estimator import PredictorMixin
from pycave.bayes.core import CovarianceType
from pycave.clustering import KMeans
from .lightning_module import (
    GaussianMixtureKmeansInitLightningModule,
    GaussianMixtureLightningModule,
    GaussianMixtureRandomInitLightningModule,
)
from .model import GaussianMixtureModel, GaussianMixtureModelConfig
from .types import GaussianMixtureInitStrategy

logger = logging.getLogger(__name__)


class GaussianMixture(
    ConfigurableBaseEstimator[GaussianMixtureModel],  # type: ignore
    PredictorMixin[TensorLike, torch.Tensor],
):
    """
    Probabilistic model assuming that data is generated from a mixture of Gaussians.

    The mixture is assumed to be composed of a fixed number of components with individual means
    and covariances. More information on Gaussian mixture models (GMMs) is available on
    `Wikipedia <https://en.wikipedia.org/wiki/Mixture_model>`_.

    See also:
        .. currentmodule:: pycave.bayes.gmm
        .. autosummary::
            :nosignatures:
            :template: classes/pytorch_module.rst

            GaussianMixtureModel
            GaussianMixtureModelConfig
    """

    #: The fitted PyTorch module with all estimated parameters.
    model_: GaussianMixtureModel
    #: A boolean indicating whether the model converged during training.
    converged_: bool
    #: The number of iterations the model was fitted for, excluding initialization.
    num_iter_: int
    #: The average per-datapoint negative log-likelihood at the last training step.
    nll_: float

    def __init__(
        self,
        num_components: int = 1,
        *,
        covariance_type: CovarianceType = "diag",
        init_strategy: GaussianMixtureInitStrategy = "kmeans",
        init_means: torch.Tensor | None = None,
        convergence_tolerance: float = 1e-3,
        covariance_regularization: float = 1e-6,
        batch_size: int | None = None,
        trainer_params: dict[str, Any] | None = None,
    ):
        """
        Args:
            num_components: The number of components in the GMM. The dimensionality of each
                component is automatically inferred from the data.
            covariance_type: The type of covariance to assume for all Gaussian components.
            init_strategy: The strategy for initializing component means and covariances.
            init_means: An optional initial guess for the means of the components. If provided,
                must be a tensor of shape ``[num_components, num_features]``. If this is given,
                the ``init_strategy`` is ignored and the means are handled as if K-means
                initialization has been run.
            convergence_tolerance: The change in the per-datapoint negative log-likelihood which
                implies that training has converged.
            covariance_regularization: A small value which is added to the diagonal of the
                covariance matrix to ensure that it is positive semi-definite.
            batch_size: The batch size to use when fitting the model. If not provided, the full
                data will be used as a single batch. Set this if the full data does not fit into
                memory.
            trainer_params: Initialization parameters to use when initializing a PyTorch Lightning
                trainer. By default, it disables various stdout logs unless PyCave is configured
                to do verbose logging. Checkpointing and logging are disabled regardless of the
                log level. This estimator further sets the following overridable defaults:

                - ``max_epochs=100``

        Note:
            The number of epochs passed to the initializer only defines the number of optimization
            epochs. Prior to that, initialization is run which may perform additional iterations
            through the data.

        Note:
            For batch training, the number of epochs run (i.e. the number of passes through the
            data) does not align with the number of epochs passed to the initializer. This is
            because the EM algorithm needs to be split up across two epochs. The actual number of
            minimum/maximum epochs is, thus, doubled. Nonetheless, :attr:`num_iter_` indicates how
            many EM iterations have been run.
        """
        super().__init__(
            default_params=dict(max_epochs=100),
            user_params=trainer_params,
        )

        self.num_components = num_components
        self.covariance_type = covariance_type
        self.init_strategy = init_strategy
        self.init_means = init_means
        self.convergence_tolerance = convergence_tolerance
        self.covariance_regularization = covariance_regularization
        self.batch_size = batch_size

    def fit(self, data: TensorLike) -> GaussianMixture:
        """
        Fits the Gaussian mixture on the provided data, estimating component priors, means and
        covariances. Parameters are estimated using the EM algorithm.

        Args:
            data: The tabular data to fit on. The dimensionality of the Gaussian mixture is
                automatically inferred from this data.

        Returns:
            The fitted Gaussian mixture.
        """
        # Initialize the model
        num_features = len(data[0])
        config = GaussianMixtureModelConfig(
            num_components=self.num_components,
            num_features=num_features,
            covariance_type=self.covariance_type,  # type: ignore
        )
        self.model_ = GaussianMixtureModel(config)

        # Setup the data loading
        loader = DataLoader(
            dataset_from_tensors(data),
            batch_size=self.batch_size or len(data),
            collate_fn=collate_tensor,
        )
        # Batch training is required whenever the data does not fit into a single batch.
        is_batch_training = self._num_batches_per_epoch(loader) > 1

        # Run k-means if required or copy means
        if self.init_means is not None:
            self.model_.means.copy_(self.init_means)
        elif self.init_strategy in ("kmeans", "kmeans++"):
            logger.info("Fitting K-means estimator...")
            params = self.trainer_params_user
            if self.init_strategy == "kmeans++":
                params = {**(params or {}), **dict(max_epochs=0)}
            estimator = KMeans(
                self.num_components,
                batch_size=self.batch_size,
                trainer_params=params,
            ).fit(data)
            self.model_.means.copy_(estimator.model_.centroids)

        # Run initialization
        logger.info("Running initialization...")
        if self.init_strategy in ("kmeans", "kmeans++") and self.init_means is None:
            module = GaussianMixtureKmeansInitLightningModule(
                self.model_,
                covariance_regularization=self.covariance_regularization,
            )
            self.trainer(max_epochs=1).fit(module, loader)
        else:
            module = GaussianMixtureRandomInitLightningModule(
                self.model_,
                covariance_regularization=self.covariance_regularization,
                is_batch_training=is_batch_training,
                use_model_means=self.init_means is not None,
            )
            self.trainer(max_epochs=1 + int(is_batch_training)).fit(module, loader)

        # Fit model
        logger.info("Fitting Gaussian mixture...")
        module = GaussianMixtureLightningModule(
            self.model_,
            convergence_tolerance=self.convergence_tolerance,
            covariance_regularization=self.covariance_regularization,
            is_batch_training=is_batch_training,
        )
        trainer = self.trainer(
            max_epochs=cast(int, self.trainer_params["max_epochs"]) * (1 + int(is_batch_training))
        )
        trainer.fit(module, loader)

        # Assign convergence properties
        self.num_iter_ = module.current_epoch
        if is_batch_training:
            self.num_iter_ //= 2
        self.converged_ = trainer.should_stop
        self.nll_ = cast(float, trainer.callback_metrics["nll"].item())
        return self
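
    # Usage sketch for ``fit`` (illustrative; the data below is made up). After
    # fitting, the convergence attributes declared on the class are populated.
    #
    #   data = torch.randn(10_000, 8)
    #   gmm = GaussianMixture(num_components=4).fit(data)
    #   print(gmm.num_iter_, gmm.converged_, gmm.nll_)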

    def sample(self, num_datapoints: int) -> torch.Tensor:
        """
        Samples datapoints from the fitted Gaussian mixture.

        Args:
            num_datapoints: The number of datapoints to sample.

        Returns:
            A tensor of shape ``[num_datapoints, dim]`` providing the samples.

        Note:
            This method does not parallelize across multiple processes, i.e. performs no
            synchronization.
        """
        return self.model_.sample(num_datapoints)
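
    # Sampling sketch (illustrative): drawing new datapoints from a fitted mixture.
    #
    #   samples = gmm.sample(100)  # shape [100, num_features]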

    def score(self, data: TensorLike) -> float:
        """
        Computes the average negative log-likelihood (NLL) of the provided datapoints.

        Args:
            data: The datapoints for which to evaluate the NLL.

        Returns:
            The average NLL of all datapoints.

        Note:
            See :meth:`score_samples` to obtain NLL values for individual datapoints.
        """
        loader = DataLoader(
            dataset_from_tensors(data),
            batch_size=self.batch_size or len(data),
            collate_fn=collate_tensor,
        )
        result = self.trainer().test(
            GaussianMixtureLightningModule(self.model_), loader, verbose=False
        )
        return result[0]["nll"]

    def score_samples(self, data: TensorLike) -> torch.Tensor:
        """
        Computes the negative log-likelihood (NLL) of each of the provided datapoints.

        Args:
            data: The datapoints for which to compute the NLL.

        Returns:
            A tensor of shape ``[num_datapoints]`` with the NLL for each datapoint.

        Attention:
            When calling this function in a multi-process environment, each process receives only
            a subset of the predictions. If you want to aggregate predictions, make sure to gather
            the values returned from this method.
        """
        loader = DataLoader(
            dataset_from_tensors(data),
            batch_size=self.batch_size or len(data),
            collate_fn=collate_tensor,
        )
        result = self.trainer().predict(GaussianMixtureLightningModule(self.model_), loader)
        # Concatenate the per-batch NLL vectors into a single [num_datapoints] tensor.
        return torch.cat([x[1] for x in cast(List[Tuple[torch.Tensor, torch.Tensor]], result)])
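
    # Relationship sketch (illustrative, single-process assumption): ``score`` is the
    # average of the per-datapoint NLLs returned by ``score_samples``.
    #
    #   per_point_nll = gmm.score_samples(data)  # shape [num_datapoints]
    #   assert abs(gmm.score(data) - per_point_nll.mean().item()) < 1e-4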

    def predict(self, data: TensorLike) -> torch.Tensor:
        """
        Computes the most likely components for each of the provided datapoints.

        Args:
            data: The datapoints for which to obtain the most likely components.

        Returns:
            A tensor of shape ``[num_datapoints]`` with the indices of the most likely components.

        Note:
            Use :meth:`predict_proba` to obtain probabilities for each component instead of the
            most likely component only.

        Attention:
            When calling this function in a multi-process environment, each process receives only
            a subset of the predictions. If you want to aggregate predictions, make sure to gather
            the values returned from this method.
        """
        return self.predict_proba(data).argmax(-1)

    def predict_proba(self, data: TensorLike) -> torch.Tensor:
        """
        Computes a distribution over the components for each of the provided datapoints.

        Args:
            data: The datapoints for which to compute the component assignment probabilities.

        Returns:
            A tensor of shape ``[num_datapoints, num_components]`` with the assignment
            probabilities for each component and datapoint. Note that each row of the tensor sums
            to 1, i.e. the returned tensor provides a proper distribution over the components for
            each datapoint.

        Attention:
            When calling this function in a multi-process environment, each process receives only
            a subset of the predictions. If you want to aggregate predictions, make sure to gather
            the values returned from this method.
        """
        loader = DataLoader(
            dataset_from_tensors(data),
            batch_size=self.batch_size or len(data),
            collate_fn=collate_tensor,
        )
        result = self.trainer().predict(GaussianMixtureLightningModule(self.model_), loader)
        return torch.cat([x[0] for x in cast(List[Tuple[torch.Tensor, torch.Tensor]], result)])
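

if __name__ == "__main__":
    # Minimal end-to-end sketch, not part of the library: fit a mixture on synthetic
    # data drawn from two well-separated Gaussians and inspect the results. All shapes
    # and hyperparameters here are assumptions chosen for demonstration.
    torch.manual_seed(0)
    data = torch.cat([torch.randn(500, 2) - 3.0, torch.randn(500, 2) + 3.0])

    gmm = GaussianMixture(num_components=2, covariance_type="diag").fit(data)
    print(f"Converged: {gmm.converged_} after {gmm.num_iter_} EM iterations")
    print(f"Average NLL: {gmm.nll_:.4f}")

    labels = gmm.predict(data)  # hard assignments, shape [1000]
    probs = gmm.predict_proba(data)  # rows sum to 1, shape [1000, 2]
    print(labels[:5], probs[:5])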