Source code for thelper.optim.metrics

"""Metrics module.

This module contains classes that implement metrics used to monitor training sessions and evaluate models.
These metrics should all inherit from :class:`thelper.optim.metrics.Metric` to allow them to be dynamically
instantiated by the framework from a configuration file, and evaluated automatically inside a training
session. For more information on this, refer to :class:`thelper.train.base.Trainer`.
"""

import logging
from abc import abstractmethod
from typing import Any, AnyStr, Optional  # noqa: F401

import numpy as np
import sklearn.metrics
import torch

import thelper.concepts
import thelper.utils
from thelper.ifaces import ClassNamesHandler, PredictionConsumer

logger = logging.getLogger(__name__)


[docs]class Metric(PredictionConsumer): """Abstract metric interface. This interface defines basic functions required so that :class:`thelper.train.base.Trainer` can figure out how to instantiate, update, and optimize a given metric while training/evaluating a model. All metrics, by definition, must be 'optimizable'. This means that they should return a scalar value when 'evaluated' and define an optimal goal (-inf or +inf). If this is not possible, then the class should probably be derived using the more generic :class:`thelper.ifaces.PredictionConsumer` instead. """ minimize = float("-inf") """Possible value of the ``goal`` attribute of this metric.""" maximize = float("inf") """Possible value of the ``goal`` attribute of this metric."""
[docs] @abstractmethod def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.AnyPredictionType target, # type: thelper.typedefs.AnyTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest prediction and groundtruth tensors from the training session. The data given here will be "consumed" internally, but it should NOT be modified. For example, a classification accuracy metric might accumulate the correct number of predictions in comparison to groundtruth labels, but never alter those predictions. The iteration/epoch indices may be used to 'reset' the internal state of this object when needed (for example, at the start of each new epoch). Remember that input, prediction, and target tensors received here will all have a batch dimension! The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ raise NotImplementedError
[docs] @abstractmethod def eval(self): """Returns the metric's evaluation result. The returned value should be a scalar. As a model improves, this scalar should get closer to the optimization goal (defined through the 'goal' attribute). This value will be queried at the end of each training epoch by the trainer. """ raise NotImplementedError
@property def goal(self): """Returns the scalar optimization goal of the metric. The returned goal can be the ``minimize`` or ``maximize`` members of ``thelper.optim.metrics.Metric`` if the class's evaluation returns a scalar value, and ``None`` otherwise. The trainer will check this value to see if monitoring the metric's evaluation result progression is possible. """ raise NotImplementedError @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not. By default, this returns ``True``, but implementations that are quite slow may return ``False``. """ return True
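# Illustrative sketch (not part of the original module): a minimal metric satisfying the
# interface above. The class name and its single counter attribute are hypothetical; it only
# assumes that the trainer forwards batched tensor targets, as documented in ``update``.
class _ExampleBatchCounter(Metric):
    """Toy metric that counts how many groundtruth elements were seen (to be maximized)."""

    def __init__(self):
        self.count = 0

    def update(self, task, input, pred, target, sample, loss, iter_idx, max_iters,
               epoch_idx, max_epochs, output_path, **kwargs):
        # accumulate only when groundtruth is available, as the metrics below do
        if target is not None:
            self.count += target.numel()

    def eval(self):
        return float(self.count)

    def reset(self):
        self.count = 0

    @property
    def goal(self):
        return Metric.maximize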
[docs]@thelper.concepts.classification @thelper.concepts.segmentation class Accuracy(Metric): r"""Classification accuracy metric interface. This is a scalar metric used to monitor the label prediction accuracy of a model. By default, it works in ``top-k`` mode, meaning that the evaluation result is given by: .. math:: \text{accuracy} = \frac{\text{nb. correct predictions}}{\text{nb. total predictions}} \cdot 100 When :math:`k>1`, a 'correct' prediction is obtained if any of the model's top :math:`k` predictions (i.e. the :math:`k` predictions with the highest score) match the groundtruth label. Otherwise, if :math:`k=1`, then only the top prediction is compared to the groundtruth label. Note that for binary classification problems, :math:`k` should always be set to 1. This metric's goal is to maximize its value :math:`\in [0,100]` (a percentage is returned). Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "top_5_accuracy": { # this type is used to instantiate the accuracy metric "type": "thelper.optim.metrics.Accuracy", # these parameters are passed to the wrapper's constructor "params": { # the top prediction count to check for a match with the groundtruth "top_k": 5 } }, # ... } # ... Todo: add support for 'dont care' target value? Attributes: top_k: number of top predictions to consider when matching with the groundtruth (default=1). max_win_size: maximum moving average window size to use (default=None, which equals dataset size). correct: total number of correct predictions stored using an array for window-based averaging. total: total number of predictions stored using an array for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, top_k=1, max_win_size=None):
        """Receives the number of predictions to consider for matches (``top_k``) and the moving
        average window size (``max_win_size``).

        Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on
        the first update call will be used instead to fix the sliding window length. In any case, the
        smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size.
        """
        assert isinstance(top_k, int) and top_k > 0, "invalid top-k value"
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        self.top_k = top_k
        self.max_win_size = max_win_size
        self.correct = None  # will be instantiated on first iter
        self.total = None  # will be instantiated on first iter
        self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(top_k={repr(self.top_k)}, max_win_size={repr(self.max_win_size)})"
    def update(self,  # see `thelper.typedefs.IterCallbackParams` for more info
               task,  # type: thelper.tasks.utils.Task
               input,  # type: thelper.typedefs.InputType
               pred,  # type: thelper.typedefs.ClassificationPredictionType
               target,  # type: thelper.typedefs.ClassificationTargetType
               sample,  # type: thelper.typedefs.SampleType
               loss,  # type: Optional[float]
               iter_idx,  # type: int
               max_iters,  # type: int
               epoch_idx,  # type: int
               max_epochs,  # type: int
               output_path,  # type: AnyStr
               **kwargs,  # type: Any
               ):  # type: (...) -> None
        """Receives the latest class prediction and groundtruth labels from the training session.

        This function computes and accumulates the number of correct and total predictions in
        the internal arrays, cycling over the iteration index if the maximum window length is reached.

        The exact signature of this function should match the one of the callbacks defined in
        :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``.
        """
        assert len(kwargs) == 0, "unexpected extra arguments present in update call"
        assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \
            "bad iteration indices given to metric update function"
        curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters)
        if self.correct is None or self.correct.size != curr_win_size:
            # each 'iteration' will have a corresponding bin with counts for that batch
            self.correct = np.zeros(curr_win_size, dtype=np.int64)
            self.total = np.zeros(curr_win_size, dtype=np.int64)
        curr_idx = iter_idx % curr_win_size
        if target is None or target.numel() == 0:
            # only accumulate results when groundtruth is available
            self.correct[curr_idx] = 0
            self.total[curr_idx] = 0
            return
        if task is not None and isinstance(task, thelper.tasks.Classification) and task.multi_label:
            assert pred.shape == target.shape, "prediction/gt tensors dim/shape mismatch"
            assert self.top_k == 1, "unexpected top k value for multi-label accuracy eval"
            self.correct[curr_idx] = np.equal((pred > 0.5).long(), target).cpu().numpy().sum(dtype=np.int64)
        else:
            assert pred.dim() == target.dim() + 1, "prediction/gt tensors dim mismatch (should be BxCx[...] and Bx[...])"
            assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch"
            assert pred.dim() <= 2 or pred.shape[2:] == target.shape[1:], "prediction/gt tensors array size mismatch"
            top_k = pred.topk(self.top_k, dim=1)[1].view(pred.shape[0], self.top_k, -1).cpu().numpy()
            true_k = target.view(target.shape[0], 1, -1).expand(-1, self.top_k, -1).cpu().numpy()
            self.correct[curr_idx] = np.any(np.equal(top_k, true_k), axis=1).sum(dtype=np.int64)
        self.total[curr_idx] = target.numel()
[docs] def eval(self): """Returns the current accuracy (in percentage) based on the accumulated prediction counts. Will issue a warning if no predictions have been accumulated yet. """ if self.total is None or self.total.size == 0 or np.sum(self.total) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("category accuracy eval result invalid (set as 0.0), no results accumulated") return 0.0 return (float(np.sum(self.correct)) / float(np.sum(self.total))) * 100
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating count arrays.""" self.correct = None self.total = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize
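# Illustrative sketch (not part of the original module): a standalone top-k accuracy
# computation mirroring the matching logic in ``Accuracy.update``/``Accuracy.eval`` above.
# The function name is hypothetical; ``pred_logits`` is a BxC score tensor and
# ``target_labels`` a B-sized label tensor.
def _example_topk_accuracy(pred_logits, target_labels, top_k=1):
    top_idx = pred_logits.topk(top_k, dim=1)[1]  # B x k predicted class indices
    hits = (top_idx == target_labels.view(-1, 1)).any(dim=1)  # true if any of the top-k matches
    return hits.sum().item() / target_labels.numel() * 100  # percentage, as returned by eval()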
[docs]@thelper.concepts.regression class MeanAbsoluteError(Metric): r"""Mean absolute error metric interface. This is a scalar metric used to monitor the mean absolute deviation (or error) for a model's predictions. This regression metric can be described as: .. math:: e(x, y) = E = \{e_1,\dots,e_N\}^\top, \quad e_n = \left| x_n - y_n \right|, where :math:`N` is the batch size. If ``reduction`` is not ``'none'``, then: .. math:: \text{MAE}(x, y) = \begin{cases} \operatorname{mean}(E), & \text{if reduction } = \text{mean.}\\ \operatorname{sum}(E), & \text{if reduction } = \text{sum.} \end{cases} `x` and `y` are tensors of arbitrary shapes with a total of `n` elements each. Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "mae": { # this type is used to instantiate the error metric "type": "thelper.optim.metrics.MeanAbsoluteError", "params": { "reduction": "mean" } }, # ... } # ... Todo: add support for 'dont care' target value? Attributes: max_win_size: maximum moving average window size to use (default=None, which equals dataset size). reduction: string representing the tensor reduction strategy to use. errors: array of error values stored for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, reduction="mean", max_win_size=None):
        """Receives the reduction strategy and the moving average window size (``max_win_size``).

        Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on
        the first update call will be used instead to fix the sliding window length. In any case, the
        smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size.
        """
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        assert reduction != "none", "metric must absolutely return a scalar, must reduce"
        self.reduction = reduction
        self.max_win_size = max_win_size
        self.errors = None  # will be instantiated on first iter
        self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(reduction={repr(self.reduction)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.RegressionPredictionType target, # type: thelper.typedefs.RegressionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. This function computes and accumulates the L1 distance between predictions and targets in the internal array, cycling over the iteration index if the maximum window length is reached. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.errors is None or self.errors.size != curr_win_size: # each 'iteration' will have a corresponding bin with the average L1 loss for that batch self.errors = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.errors[curr_idx] = None return assert pred.shape == target.shape, "prediction/gt tensors shape mismatch" self.errors[curr_idx] = torch.nn.functional.l1_loss(pred, target, reduction=self.reduction).item()
[docs] def eval(self): """Returns the current (average) mean absolute error based on the accumulated values. Will issue a warning if no predictions have been accumulated yet. """ if self.errors is None or self.errors.size == 0 or len([d for d in self.errors if d is not None]) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("mean absolute error eval result invalid (set as 0.0), no results accumulated") return 0.0 return np.mean([d for d in self.errors if d is not None])
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating the errors array.""" self.errors = None
@property def goal(self): """Returns the scalar optimization goal of this metric (minimization).""" return Metric.minimize
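# Illustrative sketch (not part of the original module): how ``eval`` above reduces the
# sliding window of per-iteration L1 errors, skipping iterations without groundtruth (stored
# as ``None``). The helper name and the sample values in the comment are hypothetical.
def _example_windowed_mae(per_iter_errors):
    valid = [e for e in per_iter_errors if e is not None]
    return float(np.mean(valid)) if valid else 0.0  # e.g. [0.5, None, 0.3] -> 0.4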
[docs]@thelper.concepts.regression class MeanSquaredError(Metric): r"""Mean squared error metric interface. This is a scalar metric used to monitor the mean squared deviation (or error) for a model's predictions. This regression metric can be described as: .. math:: e(x, y) = E = \{e_1,\dots,e_N\}^\top, \quad e_n = \left( x_n - y_n \right)^2, where :math:`N` is the batch size. If ``reduction`` is not ``'none'``, then: .. math:: \text{MSE}(x, y) = \begin{cases} \operatorname{mean}(E), & \text{if reduction } = \text{mean.}\\ \operatorname{sum}(E), & \text{if reduction } = \text{sum.} \end{cases} `x` and `y` are tensors of arbitrary shapes with a total of `n` elements each. Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "mse": { # this type is used to instantiate the error metric "type": "thelper.optim.metrics.MeanSquaredError", "params": { "reduction": "mean" } }, # ... } # ... Todo: add support for 'dont care' target value? Attributes: max_win_size: maximum moving average window size to use (default=None, which equals dataset size). reduction: string representing the tensor reduction strategy to use. errors: array of error values stored for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, reduction="mean", max_win_size=None):
        """Receives the reduction strategy and the moving average window size (``max_win_size``).

        Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on
        the first update call will be used instead to fix the sliding window length. In any case, the
        smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size.
        """
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        assert reduction != "none", "metric must absolutely return a scalar, must reduce"
        self.reduction = reduction
        self.max_win_size = max_win_size
        self.errors = None  # will be instantiated on first iter
        self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(reduction={repr(self.reduction)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.RegressionPredictionType target, # type: thelper.typedefs.RegressionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. This function computes and accumulates the mean squared error between predictions and targets in the internal array, cycling over the iteration index if the maximum window length is reached. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.errors is None or self.errors.size != curr_win_size: # each 'iteration' will have a corresponding bin with the average MSE loss for that batch self.errors = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.errors[curr_idx] = None return assert pred.shape == target.shape, "prediction/gt tensors shape mismatch" self.errors[curr_idx] = torch.nn.functional.mse_loss(pred, target, reduction=self.reduction).item()
[docs] def eval(self): """Returns the current (average) mean squared error based on the accumulated values. Will issue a warning if no predictions have been accumulated yet. """ if self.errors is None or self.errors.size == 0 or len([d for d in self.errors if d is not None]) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("mean squared error eval result invalid (set as 0.0), no results accumulated") return 0.0 return np.mean([d for d in self.errors if d is not None])
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating the errors array.""" self.errors = None
@property def goal(self): """Returns the scalar optimization goal of this metric (minimization).""" return Metric.minimize
[docs]@thelper.concepts.classification @thelper.concepts.segmentation class ExternalMetric(Metric, ClassNamesHandler): r"""External metric wrapping interface. This interface is used to wrap external metrics and use them in the training framework. The metrics of ``sklearn.metrics`` are good candidates that have been used extensively with this interface in the past, but those of other libraries might also be compatible. Along with the name of the class to import and its constructor's parameters, the user must provide a handling mode that specifies how prediction and groundtruth data should be handled in this wrapper. Also, extra arguments such as target label names, goal information, and window sizes can be provided for specific use cases related to the selected handling mode. For now, two metric handling modes (both related to classification) are supported: * ``classif_best``: the wrapper will accumulate the predicted and groundtruth classification \ labels forwarded by the trainer and provide them to the external metric for evaluation. If \ a target label name is specified, then only classifications related to that label will be \ accumulated. This is the handling mode required for count-based classification metrics such \ as accuracy, F-Measure, precision, recall, etc. * ``classif_score``: the wrapper will accumulate the prediction score of the targeted label \ along with a boolean that indicates whether this label was the groundtruth label or not. This \ is the handling mode required for score-based classification metrics such as when computing \ the area under the ROC curve (AUC). Usage examples inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the first example metric; it is used for lookup/printing only "f1_score_reject": { # this type is used to instantiate the wrapper "type": "thelper.optim.metrics.ExternalMetric", # these parameters are passed to the wrapper's constructor "params": { # the external class to import "metric_name": "sklearn.metrics.f1_score", # the parameters passed to the external class's constructor "metric_params": {}, # the wrapper metric handling mode "metric_type": "classif_best", # the target class name (note: dataset-specific) "target_name": "reject", # the goal type of the external metric "metric_goal": "max" } }, # this is the name of the second example metric; it is used for lookup/printing only "roc_auc_accept": { # this type is used to instantiate the wrapper "type": "thelper.optim.metrics.ExternalMetric", # these parameters are passed to the wrapper's constructor "params": { # the external class to import "metric_name": "sklearn.metrics.roc_auc_score", # the parameters passed to the external class's constructor "metric_params": {}, # the wrapper metric handling mode "metric_type": "classif_score", # the target class name (note: dataset-specific) "target_name": "accept", # the goal type of the external metric "metric_goal": "max" } }, # ... } # ... Attributes: metric_goal: goal of the external metric, used for monitoring. Can be ``min`` or ``max``. metric_type: handling mode of the external metric. Can only be one of the predetermined values. metric: type of the external metric that will be instantiated when ``eval`` is called. metric_params: dictionary of parameters passed to the external metric on instantiation. target_name: name of the targeted label. Used only in handling modes related to classification. target_idx: index of the targeted label. 
Used only in handling modes related to classification. class_names: holds the list of class label names provided by the dataset parser. If it is not provided when the constructor is called, it will be set by the trainer at runtime. force_softmax: specifies whether a softmax operation should be applied to the prediction scores obtained from the trainer. Only used with the "classif_score" handling mode. max_win_size: maximum moving average window size to use (default=None, which equals dataset size). pred: queue used to store predictions-related values for window-based averaging. target: queue used to store groundtruth-related values for window-based averaging. """
    def __init__(self, metric_name, metric_type, metric_goal, metric_params=None, target_name=None,
                 class_names=None, max_win_size=None, force_softmax=True, live_eval=True):
        """Receives all necessary arguments for wrapper initialization and external metric instantiation.

        See :class:`thelper.optim.metrics.ExternalMetric` for information on arguments.
        """
        assert isinstance(metric_name, str), "metric_name must be a fully qualified class name to import"
        assert metric_params is None or isinstance(metric_params, dict), "metric_params must be dictionary"
        supported_handling_types = [
            "classif_top1", "classif_best",  # the former is for backwards-compat with the latter
            "classif_scores", "classif_score",  # the former is for backwards-compat with the latter
            "regression",  # missing impl, work in progress @@@ TODO
        ]
        assert isinstance(metric_type, str) and metric_type in supported_handling_types, \
            f"unknown metric type {repr(metric_type)}"
        if metric_type == "classif_top1":
            metric_type = "classif_best"  # they are identical, just overwrite for backwards compat
        if metric_type == "classif_scores":
            metric_type = "classif_score"  # they are identical, just overwrite for backwards compat
        assert metric_goal is not None and metric_goal in ["max", "min"], "unexpected goal type"
        self.metric_goal = Metric.maximize if metric_goal == "max" else Metric.minimize
        self.metric_type = metric_type
        self.metric_name = metric_name
        self.metric = thelper.utils.import_class(metric_name)
        self.metric_params = metric_params if metric_params is not None else {}
        self.target_name = target_name
        self.target_idx = None
        self.force_softmax = None
        if metric_type == "classif_score":
            self.force_softmax = force_softmax  # only useful in this case
        # elif "regression" in metric_type: missing impl for custom handling @@@
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        self.max_win_size = max_win_size
        self.pred = None  # will be instantiated on first iter
        self.target = None  # will be instantiated on first iter
        self._live_eval = live_eval  # could be 'False' for external impls that are pretty slow to eval
        ClassNamesHandler.__init__(self, class_names)
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(metric_name={repr(self.metric_name)}, metric_type={repr(self.metric_type)}, " + \ f"metric_goal={'min' if self.goal == Metric.minimize else 'max'}, " + \ f"metric_params={repr(self.metric_params)}, target_name={repr(self.target_name)}, " + \ f"class_names={repr(self.class_names)}, max_win_size={repr(self.max_win_size)}, " + \ f"force_softmax={repr(self.force_softmax)})" @ClassNamesHandler.class_names.setter def class_names(self, class_names): """Sets the class label names that must be predicted by the model. This is only useful in metric handling modes related to classification. The goal of having class names here is to translate a target class label (provided in the constructor) into a target class index. This is required as predictions are not mapped to their original names (in string format) before being forwarded to this object by the trainer. """ if "classif" in self.metric_type: ClassNamesHandler.class_names.fset(self, class_names) if self.target_name is not None and self.class_names is not None: assert self.target_name in self.class_indices, \ f"could not find target name {repr(self.target_name)} in class names list" self.target_idx = self.class_indices[self.target_name] else: self.target_idx = None
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.AnyTargetType target, # type: thelper.typedefs.AnyPredictionType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. The handling of the data received here will depend on the current metric's handling mode. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.pred is None or self.pred.size != curr_win_size: # each 'iteration' will have a corresponding bin with counts for that batch self.pred = np.asarray([None] * curr_win_size) self.target = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if "classif" in self.metric_type: if hasattr(task, "class_names") and task.class_names != self.class_names: self.class_names = task.class_names if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.pred[curr_idx] = None self.target[curr_idx] = None return assert self.target_name is None or self.target_idx is not None, \ f"could not map target name '{self.target_name}' to target idx, missing class list" assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch" if self.target_idx is not None: y_true, y_pred = [], [] if self.metric_type == "classif_best": assert pred.dim() == 2 and target.dim() == 1, "current ext metric implementation only supports batched 1D outputs" pred_label = pred.topk(1, dim=1)[1].view(pred.shape[0]) assert pred_label.numel() == target.numel(), "pred/target classification element count mismatch" must_keep = [y_pred == self.target_idx or y_true == self.target_idx for y_pred, y_true in zip(pred_label, target)] for idx, keep in enumerate(must_keep): if keep: y_true.append(target[idx].item() == self.target_idx) y_pred.append(pred_label[idx].item() == self.target_idx) else: # self.metric_type == "classif_score" if self.force_softmax: with torch.no_grad(): pred = torch.nn.functional.softmax(pred, dim=1) if pred.dim() == 2 and target.dim() == 1: for idx, tgt in enumerate(target): y_true.append(tgt.item() == self.target_idx) y_pred.append(pred[idx, self.target_idx].item()) else: assert pred.dim() > 2 and target.dim() == pred.dim() - 1 and pred.shape[2:] == target.shape[1:] y_true = (target.reshape(-1) == self.target_idx).cpu().numpy() y_pred = pred[:, self.target_idx, ...].reshape(-1).cpu().numpy() self.target[curr_idx] = y_true self.pred[curr_idx] = y_pred else: assert self.metric_type != "classif_score", "score-based classif analysis (e.g. 
roc auc) must specify target label" if self.metric_type == "classif_best": self.target[curr_idx] = [target[idx].item() for idx in range(pred.numel())] self.pred[curr_idx] = [pred[idx].item() for idx in range(pred.numel())] else: # if self.metric_type == "regression": raise NotImplementedError
[docs] def eval(self): """Returns the external metric's evaluation result.""" if "classif" in self.metric_type: assert self.target.size == self.pred.size, "internal window size mismatch" pred, target = zip(*[(pred, target) for preds, targets in zip(self.pred, self.target) if targets is not None for pred, target in zip(preds, targets)]) return self.metric(np.stack(target, axis=0), np.stack(pred, axis=0), **self.metric_params) else: # if self.metric_type == "regression": raise NotImplementedError
[docs] def reset(self): """Toggles a reset of the metric's internal state, emptying pred/target queues.""" self.pred = None self.target = None
@property def goal(self): """Returns the scalar optimization goal of this metric (user-defined).""" return self.metric_goal @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not. By default, this returns ``True``, but implementations that are quite slow may return ``False``. """ return self._live_eval
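# Illustrative sketch (not part of the original module): programmatic instantiation of the
# wrapper above around ``sklearn.metrics.f1_score``, equivalent to the first configuration
# example in the class docstring. The two-entry class name list is hypothetical, and this
# assumes ``thelper.utils.import_class`` resolves the dotted name as documented above.
def _example_external_f1_metric():
    return ExternalMetric(
        metric_name="sklearn.metrics.f1_score",
        metric_type="classif_best",
        metric_goal="max",
        metric_params={},
        target_name="reject",
        class_names=["accept", "reject"],
    )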
[docs]@thelper.concepts.classification @thelper.concepts.segmentation class ROCCurve(Metric, ClassNamesHandler): """Receiver operating characteristic (ROC) computation interface. This class provides an interface to ``sklearn.metrics.roc_curve`` and ``sklearn.metrics.roc_auc_score`` that can produce various types of ROC-related information including the area under the curve (AUC), the false positive and negative rates for various operating points, and the ROC curve itself as an image (also compatible with tensorboardX). By default, evaluating this metric returns the Area Under the Curve (AUC). If a target operating point is set, it will instead return the false positive/negative prediction rate of the model at that point. Usage examples inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the first example; it will output the AUC of the "reject" class "roc_reject_auc": { # this type is used to instantiate the ROC metric "type": "thelper.optim.metrics.ROCCurve", # these parameters are passed to the constructor "params": { # the name of the class to evaluate "target_name": "reject" } }, # this is the name of the second example; it will output the FPR at TPR=0.99 "roc_reject_0.99tpr": { # this type is used to instantiate the ROC metric "type": "thelper.optim.metrics.ROCCurve", # these parameters are passed to the constructor "params": { # the name of the class to evaluate "target_name": "reject", # the target true positive rate (TPR) operating point "target_tpr": 0.99 } }, # ... } # ... Attributes: target_inv: used to target all classes except the named one(s); experimental! target_name: name of targeted class to generate the roc curve/auc information for. target_tpr: target operating point in terms of true positive rate (provided in constructor). target_fpr: target operating point in terms of false positive rate (provided in constructor). target_idx: index of the targeted class, mapped from target_name using the class_names list. class_names: holds the list of class label names provided by the dataset parser. If it is not provided when the constructor is called, it will be set by the trainer at runtime. force_softmax: specifies whether a softmax operation should be applied to the prediction scores obtained from the trainer. curve: roc curve generator function, called at evaluation time to generate the output string. auc: auc score generator function, called at evaluation time to generate the output string. score: queue used to store prediction score values for window-based averaging. true: queue used to store groundtruth label values for window-based averaging. """
[docs] def __init__(self, target_name, target_tpr=None, target_fpr=None, class_names=None, force_softmax=True, sample_weight=None, drop_intermediate=True): """Receives the target class/operating point info, log parameters, and roc computation arguments. Args: target_name: name of targeted class to generate the roc curve/auc information for. target_tpr: target operating point in terms of true positive rate (provided in constructor). target_fpr: target operating point in terms of false positive rate (provided in constructor). class_names: holds the list of class label names provided by the dataset parser. If it is not provided when the constructor is called, it will be set by the trainer at runtime. force_softmax: specifies whether a softmax operation should be applied to the prediction scores obtained from the trainer. sample_weight: passed to ``sklearn.metrics.roc_curve`` and ``sklearn.metrics.roc_auc_score``. drop_intermediate: passed to ``sklearn.metrics.roc_curve``. """ assert target_name is not None, "must provide a target (class) name for ROC metric" self.target_inv = False if isinstance(target_name, str) and target_name[0] == "!": self.target_inv = True self.target_name = target_name.split("!", 1)[1] else: self.target_name = target_name self.target_tpr, self.target_fpr = None, None assert target_tpr is None or target_fpr is None, "must specify only one of target_fpr and target_tpr, not both" if target_tpr is not None or target_fpr is not None: target_xpr = target_tpr if target_tpr is not None else target_fpr assert isinstance(target_xpr, float), "expected float type for target operating point" assert 0 <= target_xpr <= 1, "invalid target operation point value (must be in [0,1])" if target_tpr is not None: self.target_tpr = target_tpr else: # if target_fpr is not None self.target_fpr = target_fpr self.target_idx = None self.force_softmax = force_softmax self.sample_weight = sample_weight self.drop_intermediate = drop_intermediate def gen_curve(y_true, y_score, _target_idx, _target_inv, _sample_weight=sample_weight, _drop_intermediate=drop_intermediate): assert _target_idx is not None, "missing positive target idx at run time" _y_true, _y_score = [], [] for sample_idx, label_idx in enumerate(y_true): _y_true.append(label_idx != _target_idx if _target_inv else label_idx == _target_idx) _y_score.append(1 - y_score[sample_idx, _target_idx] if _target_inv else y_score[sample_idx, _target_idx]) res = sklearn.metrics.roc_curve(_y_true, _y_score, sample_weight=_sample_weight, drop_intermediate=_drop_intermediate) return res def gen_auc(y_true, y_score, _target_idx, _target_inv, _sample_weight=sample_weight): assert _target_idx is not None, "missing positive target idx at run time" _y_true, _y_score = [], [] for sample_idx, label_idx in enumerate(y_true): _y_true.append(label_idx != _target_idx if _target_inv else label_idx == _target_idx) _y_score.append(1 - y_score[sample_idx, _target_idx] if _target_inv else y_score[sample_idx, _target_idx]) res = sklearn.metrics.roc_auc_score(_y_true, _y_score, sample_weight=_sample_weight) return res self.curve = gen_curve self.auc = gen_auc self.score = None self.true = None ClassNamesHandler.__init__(self, class_names)
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(target_name={repr(self.target_name)}, target_tpr={repr(self.target_tpr)}, " + \ f"target_fpr={repr(self.target_fpr)}, class_names={repr(self.class_names)}, " + \ f"force_softmax={repr(self.force_softmax)}, sample_weight={repr(self.sample_weight)}, " + \ f"drop_intermediate={repr(self.drop_intermediate)})" @ClassNamesHandler.class_names.setter def class_names(self, class_names): """Sets the class label names that must be predicted by the model.""" ClassNamesHandler.class_names.fset(self, class_names) if self.target_name is not None and self.class_names is not None: assert self.target_name in self.class_indices, \ f"could not find target name {repr(self.target_name)} in class names list" self.target_idx = self.class_indices[self.target_name] else: self.target_idx = None
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.AnyPredictionType target, # type: thelper.typedefs.AnyTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert isinstance(task, thelper.tasks.Classification), "roc curve only impl for classif tasks" assert not task.multi_label, "roc curve only impl for non-multi-label classif tasks" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" if self.score is None or self.score.size != max_iters: self.score = np.asarray([None] * max_iters) self.true = np.asarray([None] * max_iters) if task.class_names != self.class_names: self.class_names = task.class_names if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.score[iter_idx] = None self.true[iter_idx] = None return assert pred.dim() == 2 or target.dim() == 1, "current classif report impl only supports batched 1D outputs" assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch" assert pred.shape[1] == len(self.class_names), "unexpected prediction class dimension size" if self.force_softmax: with torch.no_grad(): pred = torch.nn.functional.softmax(pred, dim=1) self.score[iter_idx] = pred.numpy() self.true[iter_idx] = target.numpy()
    def eval(self):
        """Returns the evaluation result (AUC/TPR/FPR).

        If no target operating point is set, the returned value is the AUC for the target class. If a
        target TPR is set, the returned value is the FPR for that operating point. If a target FPR is
        set, the returned value is the TPR for that operating point.
        """
        if self.score is None or self.true is None:
            return None
        score, true = zip(*[(score, true) for scores, trues in zip(self.score, self.true)
                            if trues is not None for score, true in zip(scores, trues)])
        # if we did not specify a target operating point in terms of true/false positive rate, return AUC
        if self.target_tpr is None and self.target_fpr is None:
            return self.auc(np.stack(true, axis=0), np.stack(score, axis=0), self.target_idx, self.target_inv)
        # otherwise, find the opposite rate at the requested target operating point
        _fpr, _tpr, _thrs = self.curve(np.stack(true, axis=0), np.stack(score, axis=0),
                                       self.target_idx, self.target_inv, _drop_intermediate=False)
        for fpr, tpr, thrs in zip(_fpr, _tpr, _thrs):
            if self.target_tpr is not None and tpr >= self.target_tpr:
                # print("for target tpr = %.5f, fpr = %.5f at threshold = %f" % (self.target_tpr, fpr, thrs))
                return fpr
            elif self.target_fpr is not None and fpr >= self.target_fpr:
                # print("for target fpr = %.5f, tpr = %.5f at threshold = %f" % (self.target_fpr, tpr, thrs))
                return tpr
        # if we did not find a proper rate match above, return worst possible value
        if self.target_tpr is not None:
            # print("for target tpr = %.5f, fpr = 1.0 at threshold = min" % self.target_tpr)
            return 1.0
        else:  # if self.target_fpr is not None:
            # print("for target fpr = %.5f, tpr = 0.0 at threshold = max" % self.target_fpr)
            return 0.0
[docs] def render(self): """Returns the ROC curve as a numpy-compatible RGBA image drawn by pyplot.""" if self.score is None: return None score, true = zip(*[(score, true) for scores, trues in zip(self.score, self.true) if trues is not None for score, true in zip(scores, trues)]) fpr, tpr, t = self.curve(np.stack(true, axis=0), np.stack(score, axis=0), self.target_idx, self.target_inv) try: fig, ax = thelper.draw.draw_roc_curve(fpr, tpr) array = thelper.draw.fig2array(fig) return array except AttributeError as e: logger.warning(f"failed to render roc curve; caught exception:\n{str(e)}") # return None if rendering fails (probably due to matplotlib on displayless server) return None
[docs] def reset(self): """Toggles a reset of the metric's internal state, emptying queues.""" self.score = None self.true = None
@property def goal(self): """Returns the scalar optimization goal of this metric (variable based on target op point).""" # if we did not specify a target operating point in terms of true/false positive rate, return AUC if self.target_tpr is None and self.target_fpr is None: return Metric.maximize # AUC must be maximized if self.target_tpr is not None: return Metric.minimize # fpr must be minimized else: # if self.target_fpr is not None: return Metric.maximize # tpr must be maximized @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not.""" return False # some operating modes might be pretty slow, check back impl later
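# Illustrative sketch (not part of the original module): reading the false positive rate at a
# target true positive rate directly from ``sklearn.metrics.roc_curve``, mirroring the scan
# performed in ``eval`` above when ``target_tpr`` is set. The function name is hypothetical;
# ``y_true`` holds binary labels and ``y_score`` the positive-class scores.
def _example_fpr_at_tpr(y_true, y_score, target_tpr):
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, y_score, drop_intermediate=False)
    for f, t in zip(fpr, tpr):
        if t >= target_tpr:
            return f  # first operating point reaching the requested TPR
    return 1.0  # worst-case FPR if the target TPR is never reached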
[docs]@thelper.concepts.regression class PSNR(Metric): r"""Peak Signal-to-Noise Ratio (PSNR) metric interface. This is a scalar metric used to monitor the change in quality of a signal (or image) following a transformation. For more information, see its definition on `[Wikipedia]`__. .. __: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio The PSNR (in decibels, dB) between a modified signal :math:`x` and its original version :math:`y` is defined as: .. math:: \text{PSNR}(x, y) = 10 * \log_{10} \Bigg( \frac{R^2}{\text{MSE}(x, y)} \Bigg) where :math:`\text{MSE}(x, y)` returns the mean squared error (see :class:`thelper.optim.metrics.MeanSquaredError` for more information), and :math:`R` is the maximum possible value for a single element in the input signal (i.e. its maximum "range"). Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "psnr": { # this type is used to instantiate the metric "type": "thelper.optim.metrics.PSNR", "params": { "data_range": "255" } }, # ... } # ... Attributes: max_win_size: maximum moving average window size to use (default=None, which equals dataset size). data_range: maximum value of an element in the target signal. psnrs: array of psnr values stored for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, data_range=1.0, max_win_size=None):
        """Receives all necessary initialization arguments to compute signal PSNRs.

        See :class:`thelper.optim.metrics.PSNR` for information on arguments.
        """
        self.max_win_size = max_win_size
        self.psnrs = None  # will be instantiated on first iter
        self.warned_eval_bad = False
        self.data_range = data_range
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(data_range={repr(self.data_range)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.RegressionPredictionType target, # type: thelper.typedefs.RegressionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.psnrs is None or self.psnrs.size != curr_win_size: # each 'iteration' will have a corresponding bin with the psnr for that batch self.psnrs = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.psnrs[curr_idx] = None return assert pred.shape == target.shape, "prediction/gt tensors shape mismatch" mse = np.mean(np.square(pred.numpy() - target.numpy()), dtype=np.float64) self.psnrs[curr_idx] = 10 * np.log10(self.data_range / mse)
[docs] def eval(self): """Returns the current (average) PSNR based on the accumulated values. Will issue a warning if no predictions have been accumulated yet. """ if self.psnrs is None or self.psnrs.size == 0 or len([v for v in self.psnrs if v is not None]) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("psnr eval result invalid (set as 0.0), no results accumulated") return 0.0 return np.mean([v for v in self.psnrs if v is not None])
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating the psnrs array.""" self.psnrs = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize
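# Illustrative sketch (not part of the original module): PSNR computed as in the class
# docstring above (R^2 over the MSE). Note that ``update`` above feeds ``data_range`` to the
# ratio directly, so the squared term here is an assumption based on the documented formula.
def _example_psnr(pred, target, data_range=1.0):
    pred = np.asarray(pred, dtype=np.float64)
    target = np.asarray(target, dtype=np.float64)
    mse = np.mean(np.square(pred - target))
    return 10.0 * np.log10((data_range ** 2) / mse)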
[docs]@thelper.concepts.detection class AveragePrecision(Metric): r"""Object detection average precision score from PascalVOC. This metric is computed based on the evaluator function implemented in :mod:`thelper.optim.eval`. It can target a single class at a time, or produce the mean average precision for all classes. Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "mAP": { # this type is used to instantiate the AP metric "type": "thelper.optim.metrics.AveragePrecision", # these parameters are passed to the wrapper's constructor "params": { # no parameters means we will compute the mAP } }, # ... } # ... Attributes: target_class: name of the class to target; if 'None', will compute mAP instead of AP. iou_threshold: Intersection Over Union (IOU) threshold for true/false positive classification. method: the evaluation method to use; can be the the latest & official PASCAL VOC toolkit approach ("all-points"), or the 11-point approach ("11-points") described in the original paper ("The PASCAL Visual Object Classes(VOC) Challenge"). max_win_size: maximum moving average window size to use (default=None, which equals dataset size). preds: array holding the predicted bounding boxes for all input samples. targets: array holding the target bounding boxes for all input samples. """
[docs] def __init__(self, target_class=None, iou_threshold=0.5, method="all-points", max_win_size=None): """Initializes metric attributes. Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on the first update call will be used instead to fix the sliding window length. In any case, the smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size. """ assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \ "invalid max sliding window size (should be positive integer)" self.target_class = target_class self.iou_threshold = iou_threshold self.method = method self.max_win_size = max_win_size self.preds = None # will be instantiated on first iter self.targets = None # will be instantiated on first iter self.task = None
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(target_class={repr(self.target_class)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.DetectionPredictionType target, # type: thelper.typedefs.DetectionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs): # type: (...) -> None """Receives the latest bbox predictions and targets from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.preds is None or self.preds.size != curr_win_size: # each 'iteration' will have a corresponding bin with counts for that batch self.preds = np.asarray([None] * curr_win_size) self.targets = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size self.task = task # keep reference for eval only if target is None or len(target) == 0: # only accumulate results when groundtruth is available (should we though? affects false negative count) self.preds[curr_idx] = None self.targets[curr_idx] = None return if not pred: pred = [[]] * len(target) assert isinstance(pred, list) and isinstance(target, list) assert all([isinstance(b, list) and all([isinstance(p, thelper.tasks.detect.BoundingBox) for p in b]) for b in pred]) assert all([isinstance(b, list) and all([isinstance(t, thelper.tasks.detect.BoundingBox) for t in b]) for b in target]) self.preds[curr_idx] = pred self.targets[curr_idx] = target
    def eval(self):
        """Returns the current average precision score based on the accumulated predictions and targets.

        Will issue a warning if no predictions have been accumulated yet.
        """
        assert self.targets.size == self.preds.size, "internal window size mismatch"
        pred, target = zip(*[(pred, target) for preds, targets in zip(self.preds, self.targets)
                             if targets for pred, target in zip(preds, targets)])
        # maybe need to concat?
        pred, target = np.concatenate(pred), np.concatenate(target)  # possible due to image ids
        if len(pred) == 0:
            # no predictions made by model
            return float("nan")
        metrics = thelper.optim.eval.compute_pascalvoc_metrics(pred, target, self.task, self.iou_threshold, self.method)
        if self.target_class is None:
            # compute mAP wrt classes that have at least one positive sample
            return np.mean([m["AP"] for m in metrics.values() if m["total positives"] > 0])
        return metrics[self.target_class]["AP"]
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating bbox arrays.""" self.preds = None self.targets = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not.""" return False # the current PascalVOC implementation is preeetty slow with lots of bboxes
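# Illustrative sketch (not part of the original module): the reduction performed by ``eval``
# above when no target class is set, i.e. averaging per-class AP values while ignoring
# classes without any positive sample. The dictionary keys follow those used in ``eval``.
def _example_mean_average_precision(per_class_metrics):
    aps = [m["AP"] for m in per_class_metrics.values() if m["total positives"] > 0]
    return float(np.mean(aps)) if aps else float("nan")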
@thelper.concepts.segmentation
class IntersectionOverUnion(Metric):
    r"""Computes the intersection over union over image classes.

    It can target a single class at a time, or produce the mean IoU (mIoU) for a number of classes.
    It can also average IoU scores from each image, or sum up all intersection and union areas and
    compute a global score.

    Usage example inside a session configuration file::

        # ...
        # lists all metrics to instantiate as a dictionary
        "metrics": {
            # ...
            # this is the name of the example metric; it is used for lookup/printing only
            "mIoU": {
                # this type is used to instantiate the IoU metric
                "type": "thelper.optim.metrics.IntersectionOverUnion",
                # these parameters are passed to the wrapper's constructor
                "params": {
                    # no parameters means we will compute the mIoU with global scoring
                }
            },
            # ...
        }
        # ...

    Attributes:
        target_names: name(s) of the class(es) to target; if 'None' or a list, the mean IoU (mIoU)
            will be computed instead of a single-class IoU.
        max_win_size: maximum moving average window size to use (default=None, which equals dataset size).
        inters: array holding the intersection areas or IoU scores for all input samples.
        unions: array holding the union areas for all input samples.
    """
[docs] def __init__(self, target_names=None, global_score=True, max_win_size=None): """Initializes metric attributes. Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on the first update call will be used instead to fix the sliding window length. In any case, the smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size. """ assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \ "invalid max sliding window size (should be positive integer)" if target_names is not None and not isinstance(target_names, (list, np.ndarray, torch.Tensor)): target_names = [target_names] self.target_names = target_names self.target_idxs = None # will be updated at runtime self.global_score = global_score self.max_win_size = max_win_size self.inters = None # will be instantiated on first iter self.unions = None # will be instantiated on first iter self.task = None self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(target_names={repr(self.target_names)}, global_score={repr(self.global_score)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.SegmentationPredictionType target, # type: thelper.typedefs.SegmentationTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs): # type: (...) -> None """Receives the latest bbox predictions and targets from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.inters is None or self.inters.size != curr_win_size: # each 'iteration' will have a corresponding bin with counts for that batch self.inters = np.asarray([None] * curr_win_size) self.unions = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if task is not None: assert isinstance(task, thelper.tasks.Segmentation), "unexpected task type with IoU metric" if self.target_names is not None: assert all([n in task.class_names for n in self.target_names]), \ "missing iou target in task class names" self.target_idxs = [task.class_indices[n] for n in self.target_names] else: self.target_idxs = list(task.class_indices.values()) self.task = task # keep reference for eval only if target is None or len(target) == 0: # only accumulate results when groundtruth is available (should we though? affects false negative count) self.inters[curr_idx] = None self.unions[curr_idx] = None return assert pred.dim() == target.dim() + 1 or pred.dim() == target.dim(), \ "prediction/gt tensors dim mismatch (should be BxCx[...] and Bx[...])" if pred.dim() == target.dim(): assert target.shape[1] == 1, "unexpected channel count (>1) for target tensor" target = torch.squeeze(target, dim=1) assert pred.dim() == target.dim() + 1, "prediction/gt tensors dim mismatch (should be BxCx[...] and Bx[...])" assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch" assert pred.dim() <= 2 or pred.shape[2:] == target.shape[1:], "prediction/gt tensors array size mismatch" with torch.no_grad(): pred_labels = pred.topk(1, dim=1)[1].view(pred.shape[0], -1).cpu().numpy() true_labels = target.view(target.shape[0], -1).cpu().numpy() assert self.task is not None, "task object necessary at this point since we need to refer to dontcare value" assert self.target_idxs, "messed up something internally..." 
if self.global_score: inters_count_map, union_count_map = {}, {} for target_idx in self.target_idxs: inters = np.logical_and(pred_labels == target_idx, true_labels == target_idx) inters_count_map[target_idx] = np.count_nonzero(inters) if self.task.dontcare is not None: valid_preds = np.logical_and(pred_labels == target_idx, true_labels != self.task.dontcare) union = np.logical_or(valid_preds, true_labels == target_idx) else: union = np.logical_or(pred_labels == target_idx, true_labels == target_idx) union_count_map[target_idx] = np.count_nonzero(union) self.inters[curr_idx] = inters_count_map self.unions[curr_idx] = union_count_map else: bious = [thelper.optim.eval.compute_mask_iou(pred_labels[b], true_labels[b], self.target_idxs, self.task.dontcare) for b in range(pred.shape[0])] self.inters[curr_idx] = {tidx: [ious[tidx] for ious in bious] for tidx in self.target_idxs} self.unions[curr_idx] = None
[docs] def eval(self): """Returns the current IoU ratio based on the accumulated counts. Will issue a warning if no predictions have been accumulated yet. """ assert self.inters.size == self.unions.size, "internal window size mismatch" if self.target_idxs is None: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("iou eval result invalid (set as 0.0), no results accumulated") return 0.0 accum_pairs = {} valid = False for tidx in self.target_idxs: accum_pairs[tidx] = [], [] for i, u in zip(self.inters, self.unions): if i is not None or u is not None: accum_pairs[tidx][0].append(i[tidx] if i is not None else None) accum_pairs[tidx][1].append(u[tidx] if u is not None else None) valid = valid or i is not None if not valid: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("iou eval result invalid (set as 0.0), no results accumulated") return 0.0 if self.global_score: tot_inters = {tidx: sum(accum_pairs[tidx][0]) for tidx in self.target_idxs} tot_union = {tidx: sum(accum_pairs[tidx][1]) for tidx in self.target_idxs} iou_map = {tidx: (tot_inters[tidx] / tot_union[tidx]) if tot_union[tidx] != 0 else 0.0 for tidx in self.target_idxs} else: iou_map = {} for tidx in self.target_idxs: ious = [] for batch_ious in accum_pairs[tidx][0]: for iou in batch_ious: ious.append(iou) iou_map[tidx] = 0.0 if not ious else np.mean(ious) # could add per-class IoU scores to some log before averaging below... return np.array(list(iou_map.values())).mean()
    def reset(self):
        """Toggles a reset of the metric's internal state, deallocating the intersection/union arrays."""
        self.inters = None
        self.unions = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize
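# Illustrative sketch (not part of the original module): single-class IoU computed from label
# maps the same way as the global-score mode above, optionally ignoring a 'dontcare' value in
# the groundtruth. The function name is hypothetical; inputs are integer label arrays.
def _example_global_iou(pred_labels, true_labels, class_idx, dontcare=None):
    pred_labels, true_labels = np.asarray(pred_labels), np.asarray(true_labels)
    inters = np.logical_and(pred_labels == class_idx, true_labels == class_idx)
    if dontcare is not None:
        valid_preds = np.logical_and(pred_labels == class_idx, true_labels != dontcare)
        union = np.logical_or(valid_preds, true_labels == class_idx)
    else:
        union = np.logical_or(pred_labels == class_idx, true_labels == class_idx)
    union_count = np.count_nonzero(union)
    return np.count_nonzero(inters) / union_count if union_count else 0.0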