
Source code for query_strategies.representativeness_sampling_uncertainty

""" Combined representativeness and uncertainty sampling strategy """

from typing import List, Literal

import numpy as np

from datasets import ActiveLearningDataModule
from models.pytorch_model import PytorchModel

from .representativeness_sampling_strategy_base import (
    RepresentativenessSamplingStrategyBase,
)
from .representativeness_sampling_distances import (
    DistanceBasedRepresentativenessSamplingStrategy,
)
from .representativeness_sampling_clustering import (
    ClusteringBasedRepresentativenessSamplingStrategy,
)
from .uncertainty_sampling_strategy import UncertaintySamplingStrategy


class UncertaintyRepresentativenessSamplingStrategy(
    RepresentativenessSamplingStrategyBase
):
    """
    Sampling strategy that combines representativeness and uncertainty sampling.

    Args:
        representativeness_algorithm (string, optional): The algorithm to be used to select the most
            representative samples: `"most_distant_sample"` | `"cluster_coverage"`. Defaults to
            `"cluster_coverage"`.

            - | `"most_distant_sample"`: The unlabeled item that has the highest feature distance to the
                labeled set is selected for labeling.
            - | `"cluster_coverage"`: The features of the unlabeled and labeled items are clustered and an
                item from the most underrepresented cluster is selected for labeling.
        calculation_method (string, optional): The algorithm to be used for computing the uncertainty:
            `"distance"` | `"entropy"`.
    """

    def __init__(
        self,
        representativeness_algorithm: Literal[
            "most_distant_sample", "cluster_coverage"
        ] = "cluster_coverage",
        calculation_method: Literal["distance", "entropy"] = "entropy",
        **kwargs,
    ):
        super().__init__(**kwargs)

        if representativeness_algorithm == "most_distant_sample":
            self.representativeness_sampling_strategy = (
                DistanceBasedRepresentativenessSamplingStrategy()
            )
        elif representativeness_algorithm == "cluster_coverage":
            self.representativeness_sampling_strategy = (
                ClusteringBasedRepresentativenessSamplingStrategy()
            )
        else:
            raise ValueError(
                f"Invalid representativeness sampling algorithm: {representativeness_algorithm}."
            )

        self.uncertainty_sampling_strategy = UncertaintySamplingStrategy(
            calculation_method=calculation_method
        )
    def prepare_representativeness_computation(
        self,
        feature_vectors_training_set: np.ndarray,
        case_ids_training_set: List[str],
        feature_vectors_unlabeled_set: np.ndarray,
        case_ids_unlabeled_set: List[str],
    ) -> None:
        """
        Prepares computation of representativeness scores.

        Args:
            feature_vectors_training_set (numpy.ndarray): Feature vectors of the items in the training set.
            case_ids_training_set (List[str]): Case IDs of the items in the training set.
            feature_vectors_unlabeled_set (numpy.ndarray): Feature vectors of the items in the unlabeled set.
            case_ids_unlabeled_set (List[str]): Case IDs of the items in the unlabeled set.
        """

        self.representativeness_sampling_strategy.prepare_representativeness_computation(
            feature_vectors_training_set,
            case_ids_training_set,
            feature_vectors_unlabeled_set,
            case_ids_unlabeled_set,
        )
    def compute_representativeness_scores(
        self,
        model: PytorchModel,
        data_module: ActiveLearningDataModule,
        feature_vectors_training_set: np.ndarray,
        feature_vectors_unlabeled_set: np.ndarray,
        case_ids_unlabeled_set: List[str],
    ) -> List[float]:
        """
        Computes representativeness scores for all unlabeled items.

        Args:
            model (PytorchModel): Current model that should be improved by selecting additional data for
                labeling.
            data_module (ActiveLearningDataModule): A data module object providing data.
            feature_vectors_training_set (np.ndarray): Feature vectors of the items in the training set.
            feature_vectors_unlabeled_set (np.ndarray): Feature vectors of the items in the unlabeled set.
            case_ids_unlabeled_set (List[str]): Case IDs of the items in the unlabeled set.

        Returns:
            List[float]: Representativeness score for each item in the unlabeled set. Items that are
            underrepresented in the training set receive higher scores.
        """

        representativeness_scores = (
            self.representativeness_sampling_strategy.compute_representativeness_scores(
                model,
                data_module,
                feature_vectors_training_set,
                feature_vectors_unlabeled_set,
                case_ids_unlabeled_set,
            )
        )
        representativeness_scores = self._normalize_scores(
            np.array(representativeness_scores)
        )

        (
            uncertainty_scores,
            _,
        ) = self.uncertainty_sampling_strategy.compute_uncertainties(model, data_module)
        uncertainty_scores = self._normalize_scores(np.array(uncertainty_scores))

        return representativeness_scores + uncertainty_scores
    @staticmethod
    def _normalize_scores(scores: np.ndarray) -> np.ndarray:
        """
        Normalizes vector of representativeness scores.

        Args:
            scores (np.ndarray): Vector to be normalized.

        Returns:
            np.ndarray: Normalized vector.
        """

        return (scores - scores.min(keepdims=True)) / (
            scores.max(keepdims=True) - scores.min(keepdims=True)
        )
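
The strategy ranks unlabeled items by adding the min-max normalized representativeness and uncertainty scores, so both criteria contribute on the same 0-to-1 scale. The following is a minimal sketch of that combination step in isolation, using hypothetical dummy score vectors instead of scores produced by a real model and data module; it only illustrates the arithmetic performed in `compute_representativeness_scores` and `_normalize_scores`, not how the scores are obtained.

import numpy as np


def min_max_normalize(scores: np.ndarray) -> np.ndarray:
    # Same min-max scaling as UncertaintyRepresentativenessSamplingStrategy._normalize_scores.
    return (scores - scores.min(keepdims=True)) / (
        scores.max(keepdims=True) - scores.min(keepdims=True)
    )


# Hypothetical raw scores for three unlabeled items (dummy values for illustration).
representativeness_scores = np.array([0.2, 0.8, 0.5])
uncertainty_scores = np.array([4.0, 10.0, 2.0])

# Both score vectors are rescaled to [0, 1] before they are added, so neither
# criterion dominates the combined ranking because of its raw value range.
combined = min_max_normalize(representativeness_scores) + min_max_normalize(
    uncertainty_scores
)

# Items with the highest combined score would be proposed for labeling first.
print(combined)  # [0.25 2.   0.5 ]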
