Source code for query_strategies.representativeness_sampling_uncertainty
""" Combined representativeness and uncertainty sampling strategy """
from typing import List, Literal
import numpy as np
from datasets import ActiveLearningDataModule
from models.pytorch_model import PytorchModel
from .representativeness_sampling_strategy_base import (
RepresentativenessSamplingStrategyBase,
)
from .representativeness_sampling_distances import (
DistanceBasedRepresentativenessSamplingStrategy,
)
from .representativeness_sampling_clustering import (
ClusteringBasedRepresentativenessSamplingStrategy,
)
from .uncertainty_sampling_strategy import UncertaintySamplingStrategy
class UncertaintyRepresentativenessSamplingStrategy(
RepresentativenessSamplingStrategyBase
):
"""
Sampling strategy that combines representativeness and uncertainty sampling.
Args:
representativeness_algorithm (string, optional): The algorithm to be used to select the most representative
samples: `"most_distant_sample"` | `"cluster_coverage"`. Defaults to `"cluster_coverage"`.
- | `"most_distant_sample"`: The unlabeled item that has the highest feature distance to the labeled set
| is selected for labeling.
- | `"cluster_coverage"`: The features of the unlabeled and labeled items are clustered and an item from
| the most underrepresented cluster is selected for labeling.
        calculation_method (string, optional): The algorithm to be used for computing the uncertainty: `"distance"` |
            `"entropy"`. Defaults to `"entropy"`.
"""
def __init__(
self,
representativeness_algorithm: Literal[
"most_distant_sample", "cluster_coverage"
] = "cluster_coverage",
calculation_method: Literal["distance", "entropy"] = "entropy",
**kwargs,
):
super().__init__(**kwargs)
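        # Choose the representativeness backend according to the requested algorithm.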
if representativeness_algorithm == "most_distant_sample":
self.representativeness_sampling_strategy = (
DistanceBasedRepresentativenessSamplingStrategy()
)
elif representativeness_algorithm == "cluster_coverage":
self.representativeness_sampling_strategy = (
ClusteringBasedRepresentativenessSamplingStrategy()
)
else:
raise ValueError(
f"Invalid representativeness sampling algorithm: {representativeness_algorithm}."
)
self.uncertainty_sampling_strategy = UncertaintySamplingStrategy(
calculation_method=calculation_method
)
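
    # Construction sketch (illustrative only; the variable name is arbitrary and the values shown are the
    # non-default options documented in the class docstring):
    #
    #     strategy = UncertaintyRepresentativenessSamplingStrategy(
    #         representativeness_algorithm="most_distant_sample",
    #         calculation_method="distance",
    #     )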
    def prepare_representativeness_computation(
self,
feature_vectors_training_set: np.ndarray,
case_ids_training_set: List[str],
feature_vectors_unlabeled_set: np.ndarray,
case_ids_unlabeled_set: List[str],
) -> None:
"""
Prepares computation of representativeness scores.
Args:
feature_vectors_training_set (numpy.ndarray): Feature vectors of the items in the training set.
case_ids_training_set (List[str]): Case IDs of the items in the training set.
feature_vectors_unlabeled_set (numpy.ndarray): Feature vectors of the items in the unlabeled set.
case_ids_unlabeled_set (List[str]): Case IDs of the items in the unlabeled set.
"""
self.representativeness_sampling_strategy.prepare_representativeness_computation(
feature_vectors_training_set,
case_ids_training_set,
feature_vectors_unlabeled_set,
case_ids_unlabeled_set,
)
    def compute_representativeness_scores(
self,
model: PytorchModel,
data_module: ActiveLearningDataModule,
feature_vectors_training_set: np.ndarray,
feature_vectors_unlabeled_set: np.ndarray,
case_ids_unlabeled_set: List[str],
) -> List[float]:
"""
Computes representativeness scores for all unlabeled items.
Args:
model (PytorchModel): Current model that should be improved by selecting additional data for labeling.
data_module (ActiveLearningDataModule): A data module object providing data.
            feature_vectors_training_set (numpy.ndarray): Feature vectors of the items in the training set.
            feature_vectors_unlabeled_set (numpy.ndarray): Feature vectors of the items in the unlabeled set.
case_ids_unlabeled_set (List[str]): Case IDs of the items in the unlabeled set.
Returns:
            List[float]: Representativeness score for each item in the unlabeled set. Items that are underrepresented in
                the training set receive higher scores.
"""
representativeness_scores = (
self.representativeness_sampling_strategy.compute_representativeness_scores(
model,
data_module,
feature_vectors_training_set,
feature_vectors_unlabeled_set,
case_ids_unlabeled_set,
)
)
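        # Min-max scale the representativeness scores to [0, 1] so that they can later be combined with the
        # uncertainty scores on a comparable scale.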
representativeness_scores = self._normalize_scores(
np.array(representativeness_scores)
)
(
uncertainty_scores,
_,
) = self.uncertainty_sampling_strategy.compute_uncertainties(model, data_module)
uncertainty_scores = self._normalize_scores(np.array(uncertainty_scores))
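        # Combine the two normalized score vectors by element-wise addition: an item that is both strongly
        # underrepresented and highly uncertain receives a combined score close to 2, while an item that is
        # well represented and confidently predicted receives a score close to 0.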
return representativeness_scores + uncertainty_scores
@staticmethod
def _normalize_scores(scores: np.ndarray) -> np.ndarray:
"""
        Min-max normalizes a vector of scores to the range [0, 1].
Args:
scores (np.ndarray): Vector to be normalized.
Returns:
np.ndarray: Normalized vector.
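
        Example:
            A small worked illustration of the min-max scaling (input values chosen only for demonstration)::

                >>> UncertaintyRepresentativenessSamplingStrategy._normalize_scores(
                ...     np.array([1.0, 3.0, 5.0])
                ... )
                array([0. , 0.5, 1. ])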
"""
return (scores - scores.min(keepdims=True)) / (
scores.max(keepdims=True) - scores.min(keepdims=True)
)