• Docs >
  • Module code >
  • query_strategies.representativeness_sampling_distances
Shortcuts

Source code for query_strategies.representativeness_sampling_distances

"""Distance-based representativeness sampling strategy"""

import math
from typing import List, Literal

import psutil
import numpy as np
import scipy

from datasets import ActiveLearningDataModule
from models.pytorch_model import PytorchModel
from .representativeness_sampling_strategy_base import (
    RepresentativenessSamplingStrategyBase,
)


class DistanceBasedRepresentativenessSamplingStrategy(
    RepresentativenessSamplingStrategyBase
):  # pylint: disable=too-few-public-methods
    """
    Representativeness sampling strategy that selects the items with the highest average feature distance to the items
    in the training set.

    Args:
        feature_type (string, optional): Type of feature vectors to be used: `"model_features"` |
            `"image_features"`:
            - `"model_features"`: Feature vectors retrieved from the inner layers of the model are used.
            - `"image_features"`: The input images are used as feature vectors.
            Defaults to `"model_features"`.
        feature_dimensionality (int, optional): Number of dimensions the reduced feature vector should have.
            Defaults to 10.
        distance_metric (string, optional): Metric to be used for calculating the distance between feature vectors:
            `"euclidean"` | `"cosine"` | `"russellrao"`. Defaults to `"euclidean"`.

    Raises:
        ValueError: If `distance_metric` is not one of the supported metrics.
    """

    def __init__(
        self,
        feature_type: Literal["model_features", "image_features"] = "model_features",
        feature_dimensionality: int = 10,
        distance_metric: Literal["euclidean", "cosine", "russellrao"] = "euclidean",
        **kwargs,
    ):
        super().__init__(
            feature_type=feature_type,
            feature_dimensionality=feature_dimensionality,
            **kwargs,
        )

        if distance_metric not in ["euclidean", "cosine", "russellrao"]:
            raise ValueError(f"Invalid distance metric: {distance_metric}.")

        self.distance_metric = distance_metric

    def _average_feature_distances(
        self,
        feature_vectors_training_set: np.ndarray,
        feature_vectors_unlabeled_set: np.ndarray,
    ) -> np.ndarray:
        """
        Computes average distances between the feature vectors from the unlabeled set and the feature vectors from the
        training set.

        Args:
            feature_vectors_training_set (numpy.ndarray): Feature vectors from the training set.
            feature_vectors_unlabeled_set (numpy.ndarray): Feature vectors from the unlabeled set.

        Returns:
            numpy.ndarray: For each feature vector from the unlabeled set, average distance to the feature vectors
            from the training set. The result has one entry per unlabeled feature vector, in input order. An empty
            array is returned if the unlabeled set is empty.

        Raises:
            ValueError: If the training set contains no feature vectors.
        """

        # imported explicitly so that the code does not depend on `import scipy` having loaded the `scipy.spatial`
        # submodule as a side effect
        from scipy.spatial.distance import (  # pylint: disable=import-outside-toplevel
            cdist,
        )

        n_unlabeled = len(feature_vectors_unlabeled_set)
        n_training = len(feature_vectors_training_set)

        average_feature_distances = np.zeros(n_unlabeled)

        if n_unlabeled == 0:
            # nothing to score; `np.array_split` would reject a split count of zero
            return average_feature_distances

        if n_training == 0:
            raise ValueError(
                "Cannot compute average feature distances: the training set contains no feature vectors."
            )

        # The pairwise distance matrix can become large. The unlabeled feature vectors are therefore processed in
        # chunks. The chunk size is estimated from the size of one row of the distance matrix, i.e. one float64 value
        # per training feature vector, relative to the currently available memory.
        free_memory = psutil.virtual_memory().available
        bytes_per_distance_row = n_training * np.dtype(np.float64).itemsize
        split_size = max(free_memory // bytes_per_distance_row - 1, 1)

        n_splits = math.ceil(n_unlabeled / split_size)
        unlabeled_chunks = np.array_split(feature_vectors_unlabeled_set, n_splits)

        # `np.array_split` may return chunks of different lengths, so the write position is tracked with a running
        # offset instead of being derived from the chunk index
        start_index = 0
        for current_chunk in unlabeled_chunks:
            end_index = start_index + len(current_chunk)

            feature_distances = cdist(
                current_chunk,
                feature_vectors_training_set,
                self.distance_metric,
            )
            average_feature_distances[start_index:end_index] = feature_distances.mean(
                axis=1
            )

            start_index = end_index

        return average_feature_distances

    def compute_representativeness_scores(
        self,
        model: PytorchModel,
        data_module: ActiveLearningDataModule,
        feature_vectors_training_set,
        feature_vectors_unlabeled_set,
        case_ids_unlabeled_set,
    ) -> List[float]:
        """
        Computes representativeness scores for all unlabeled items.

        Args:
            model (PytorchModel): Current model that should be improved by selecting additional data for labeling.
                Not used by this strategy.
            data_module (ActiveLearningDataModule): A data module object providing data. Not used by this strategy.
            feature_vectors_training_set (np.ndarray): Feature vectors of the items in the training set.
            feature_vectors_unlabeled_set (np.ndarray): Feature vectors of the items in the unlabeled set.
            case_ids_unlabeled_set (List[str]): Case IDs of the items in the unlabeled set. Not used by this strategy.

        Returns:
            List[float]: Representativeness score for each item in the unlabeled set. Items that are underrepresented
            in the training receive higher scores. The scores are the average feature distances and are returned
            as a one-dimensional NumPy array.
        """

        return self._average_feature_distances(
            feature_vectors_training_set, feature_vectors_unlabeled_set
        )

Docs

Access comprehensive developer documentation for Active Segmentation

View Docs