Source code for thelper.optim.metrics

"""Metrics module.

This module contains classes that implement metrics used to monitor training sessions and evaluate models.
These metrics should all inherit from :class:`thelper.optim.metrics.Metric` to allow them to be dynamically
instantiated by the framework from a configuration file, and evaluated automatically inside a training
session. For more information on this, refer to :class:`thelper.train.base.Trainer`.
"""

import logging
from abc import abstractmethod
from typing import Any, AnyStr, Optional  # noqa: F401

import numpy as np
import sklearn.metrics
import torch

import thelper.concepts
import thelper.utils
from thelper.ifaces import ClassNamesHandler, PredictionConsumer

logger = logging.getLogger(__name__)


[docs]class Metric(PredictionConsumer): """Abstract metric interface. This interface defines basic functions required so that :class:`thelper.train.base.Trainer` can figure out how to instantiate, update, and optimize a given metric while training/evaluating a model. All metrics, by definition, must be 'optimizable'. This means that they should return a scalar value when 'evaluated' and define an optimal goal (-inf or +inf). If this is not possible, then the class should probably be derived using the more generic :class:`thelper.ifaces.PredictionConsumer` instead. """ minimize = float("-inf") """Possible value of the ``goal`` attribute of this metric.""" maximize = float("inf") """Possible value of the ``goal`` attribute of this metric."""
[docs] @abstractmethod def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.AnyPredictionType target, # type: thelper.typedefs.AnyTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest prediction and groundtruth tensors from the training session. The data given here will be "consumed" internally, but it should NOT be modified. For example, a classification accuracy metric might accumulate the correct number of predictions in comparison to groundtruth labels, but never alter those predictions. The iteration/epoch indices may be used to 'reset' the internal state of this object when needed (for example, at the start of each new epoch). Remember that input, prediction, and target tensors received here will all have a batch dimension! The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ raise NotImplementedError
[docs] @abstractmethod def eval(self): """Returns the metric's evaluation result. The returned value should be a scalar. As a model improves, this scalar should get closer to the optimization goal (defined through the 'goal' attribute). This value will be queried at the end of each training epoch by the trainer. """ raise NotImplementedError
@property def goal(self): """Returns the scalar optimization goal of the metric. The returned goal can be the ``minimize`` or ``maximize`` members of ``thelper.optim.metrics.Metric`` if the class's evaluation returns a scalar value, and ``None`` otherwise. The trainer will check this value to see if monitoring the metric's evaluation result progression is possible. """ raise NotImplementedError @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not. By default, this returns ``True``, but implementations that are quite slow may return ``False``. """ return True
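# Illustrative sketch (not part of the original module): a minimal metric satisfying the
# interface above. The class name and its single counter attribute are hypothetical; it only
# assumes that the trainer forwards batched tensor targets, as documented in ``update``.
class _ExampleBatchCounter(Metric):
    """Toy metric that counts how many groundtruth elements were seen (to be maximized)."""

    def __init__(self):
        self.count = 0

    def update(self, task, input, pred, target, sample, loss, iter_idx, max_iters,
               epoch_idx, max_epochs, output_path, **kwargs):
        # accumulate only when groundtruth is available, as the metrics below do
        if target is not None:
            self.count += target.numel()

    def eval(self):
        return float(self.count)

    def reset(self):
        self.count = 0

    @property
    def goal(self):
        return Metric.maximize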
[docs]@thelper.concepts.classification @thelper.concepts.segmentation class Accuracy(Metric): r"""Classification accuracy metric interface. This is a scalar metric used to monitor the label prediction accuracy of a model. By default, it works in ``top-k`` mode, meaning that the evaluation result is given by: .. math:: \text{accuracy} = \frac{\text{nb. correct predictions}}{\text{nb. total predictions}} \cdot 100 When :math:`k>1`, a 'correct' prediction is obtained if any of the model's top :math:`k` predictions (i.e. the :math:`k` predictions with the highest score) match the groundtruth label. Otherwise, if :math:`k=1`, then only the top prediction is compared to the groundtruth label. Note that for binary classification problems, :math:`k` should always be set to 1. This metric's goal is to maximize its value :math:`\in [0,100]` (a percentage is returned). Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "top_5_accuracy": { # this type is used to instantiate the accuracy metric "type": "thelper.optim.metrics.Accuracy", # these parameters are passed to the wrapper's constructor "params": { # the top prediction count to check for a match with the groundtruth "top_k": 5 } }, # ... } # ... Todo: add support for 'dont care' target value? Attributes: top_k: number of top predictions to consider when matching with the groundtruth (default=1). max_win_size: maximum moving average window size to use (default=None, which equals dataset size). correct: total number of correct predictions stored using an array for window-based averaging. total: total number of predictions stored using an array for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, top_k=1, max_win_size=None):
        """Receives the number of predictions to consider for matches (``top_k``) and the moving
        average window size (``max_win_size``).

        Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on
        the first update call will be used instead to fix the sliding window length. In any case, the
        smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size.
        """
        assert isinstance(top_k, int) and top_k > 0, "invalid top-k value"
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        self.top_k = top_k
        self.max_win_size = max_win_size
        self.correct = None  # will be instantiated on first iter
        self.total = None  # will be instantiated on first iter
        self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(top_k={repr(self.top_k)}, max_win_size={repr(self.max_win_size)})"
    def update(self,  # see `thelper.typedefs.IterCallbackParams` for more info
               task,  # type: thelper.tasks.utils.Task
               input,  # type: thelper.typedefs.InputType
               pred,  # type: thelper.typedefs.ClassificationPredictionType
               target,  # type: thelper.typedefs.ClassificationTargetType
               sample,  # type: thelper.typedefs.SampleType
               loss,  # type: Optional[float]
               iter_idx,  # type: int
               max_iters,  # type: int
               epoch_idx,  # type: int
               max_epochs,  # type: int
               output_path,  # type: AnyStr
               **kwargs,  # type: Any
               ):  # type: (...) -> None
        """Receives the latest class prediction and groundtruth labels from the training session.

        This function computes and accumulates the number of correct and total predictions in
        the internal arrays, cycling over the iteration index if the maximum window length is reached.

        The exact signature of this function should match the one of the callbacks defined in
        :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``.
        """
        assert len(kwargs) == 0, "unexpected extra arguments present in update call"
        assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \
            "bad iteration indices given to metric update function"
        curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters)
        if self.correct is None or self.correct.size != curr_win_size:
            # each 'iteration' will have a corresponding bin with counts for that batch
            self.correct = np.zeros(curr_win_size, dtype=np.int64)
            self.total = np.zeros(curr_win_size, dtype=np.int64)
        curr_idx = iter_idx % curr_win_size
        if target is None or target.numel() == 0:
            # only accumulate results when groundtruth is available
            self.correct[curr_idx] = 0
            self.total[curr_idx] = 0
            return
        if task is not None and isinstance(task, thelper.tasks.Classification) and task.multi_label:
            assert pred.shape == target.shape, "prediction/gt tensors dim/shape mismatch"
            assert self.top_k == 1, "unexpected top k value for multi-label accuracy eval"
            self.correct[curr_idx] = np.equal((pred > 0.5).long(), target).cpu().numpy().sum(dtype=np.int64)
        else:
            assert pred.dim() == target.dim() + 1, "prediction/gt tensors dim mismatch (should be BxCx[...] and Bx[...])"
            assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch"
            assert pred.dim() <= 2 or pred.shape[2:] == target.shape[1:], "prediction/gt tensors array size mismatch"
            top_k = pred.topk(self.top_k, dim=1)[1].view(pred.shape[0], self.top_k, -1).cpu().numpy()
            true_k = target.view(target.shape[0], 1, -1).expand(-1, self.top_k, -1).cpu().numpy()
            self.correct[curr_idx] = np.any(np.equal(top_k, true_k), axis=1).sum(dtype=np.int64)
        self.total[curr_idx] = target.numel()
[docs] def eval(self): """Returns the current accuracy (in percentage) based on the accumulated prediction counts. Will issue a warning if no predictions have been accumulated yet. """ if self.total is None or self.total.size == 0 or np.sum(self.total) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("category accuracy eval result invalid (set as 0.0), no results accumulated") return 0.0 return (float(np.sum(self.correct)) / float(np.sum(self.total))) * 100
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating count arrays.""" self.correct = None self.total = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize
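# Illustrative sketch (not part of the original module): a standalone top-k accuracy
# computation mirroring the matching logic in ``Accuracy.update``/``Accuracy.eval`` above.
# The function name is hypothetical; ``pred_logits`` is a BxC score tensor and
# ``target_labels`` a B-sized label tensor.
def _example_topk_accuracy(pred_logits, target_labels, top_k=1):
    top_idx = pred_logits.topk(top_k, dim=1)[1]  # B x k predicted class indices
    hits = (top_idx == target_labels.view(-1, 1)).any(dim=1)  # true if any of the top-k matches
    return hits.sum().item() / target_labels.numel() * 100  # percentage, as returned by eval()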
[docs]@thelper.concepts.regression class MeanAbsoluteError(Metric): r"""Mean absolute error metric interface. This is a scalar metric used to monitor the mean absolute deviation (or error) for a model's predictions. This regression metric can be described as: .. math:: e(x, y) = E = \{e_1,\dots,e_N\}^\top, \quad e_n = \left| x_n - y_n \right|, where :math:`N` is the batch size. If ``reduction`` is not ``'none'``, then: .. math:: \text{MAE}(x, y) = \begin{cases} \operatorname{mean}(E), & \text{if reduction } = \text{mean.}\\ \operatorname{sum}(E), & \text{if reduction } = \text{sum.} \end{cases} `x` and `y` are tensors of arbitrary shapes with a total of `n` elements each. Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "mae": { # this type is used to instantiate the error metric "type": "thelper.optim.metrics.MeanAbsoluteError", "params": { "reduction": "mean" } }, # ... } # ... Todo: add support for 'dont care' target value? Attributes: max_win_size: maximum moving average window size to use (default=None, which equals dataset size). reduction: string representing the tensor reduction strategy to use. errors: array of error values stored for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, reduction="mean", max_win_size=None):
        """Receives the reduction strategy and the moving average window size (``max_win_size``).

        Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on
        the first update call will be used instead to fix the sliding window length. In any case, the
        smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size.
        """
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        assert reduction != "none", "metric must absolutely return a scalar, must reduce"
        self.reduction = reduction
        self.max_win_size = max_win_size
        self.errors = None  # will be instantiated on first iter
        self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(reduction={repr(self.reduction)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.RegressionPredictionType target, # type: thelper.typedefs.RegressionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. This function computes and accumulates the L1 distance between predictions and targets in the internal array, cycling over the iteration index if the maximum window length is reached. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.errors is None or self.errors.size != curr_win_size: # each 'iteration' will have a corresponding bin with the average L1 loss for that batch self.errors = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.errors[curr_idx] = None return assert pred.shape == target.shape, "prediction/gt tensors shape mismatch" self.errors[curr_idx] = torch.nn.functional.l1_loss(pred, target, reduction=self.reduction).item()
[docs] def eval(self): """Returns the current (average) mean absolute error based on the accumulated values. Will issue a warning if no predictions have been accumulated yet. """ if self.errors is None or self.errors.size == 0 or len([d for d in self.errors if d is not None]) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("mean absolute error eval result invalid (set as 0.0), no results accumulated") return 0.0 return np.mean([d for d in self.errors if d is not None])
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating the errors array.""" self.errors = None
@property def goal(self): """Returns the scalar optimization goal of this metric (minimization).""" return Metric.minimize
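# Illustrative sketch (not part of the original module): how ``eval`` above reduces the
# sliding window of per-iteration L1 errors, skipping iterations without groundtruth (stored
# as ``None``). The helper name and the sample values in the comment are hypothetical.
def _example_windowed_mae(per_iter_errors):
    valid = [e for e in per_iter_errors if e is not None]
    return float(np.mean(valid)) if valid else 0.0  # e.g. [0.5, None, 0.3] -> 0.4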
[docs]@thelper.concepts.regression class MeanSquaredError(Metric): r"""Mean squared error metric interface. This is a scalar metric used to monitor the mean squared deviation (or error) for a model's predictions. This regression metric can be described as: .. math:: e(x, y) = E = \{e_1,\dots,e_N\}^\top, \quad e_n = \left( x_n - y_n \right)^2, where :math:`N` is the batch size. If ``reduction`` is not ``'none'``, then: .. math:: \text{MSE}(x, y) = \begin{cases} \operatorname{mean}(E), & \text{if reduction } = \text{mean.}\\ \operatorname{sum}(E), & \text{if reduction } = \text{sum.} \end{cases} `x` and `y` are tensors of arbitrary shapes with a total of `n` elements each. Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "mse": { # this type is used to instantiate the error metric "type": "thelper.optim.metrics.MeanSquaredError", "params": { "reduction": "mean" } }, # ... } # ... Todo: add support for 'dont care' target value? Attributes: max_win_size: maximum moving average window size to use (default=None, which equals dataset size). reduction: string representing the tensor reduction strategy to use. errors: array of error values stored for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, reduction="mean", max_win_size=None):
        """Receives the reduction strategy and the moving average window size (``max_win_size``).

        Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on
        the first update call will be used instead to fix the sliding window length. In any case, the
        smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size.
        """
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        assert reduction != "none", "metric must absolutely return a scalar, must reduce"
        self.reduction = reduction
        self.max_win_size = max_win_size
        self.errors = None  # will be instantiated on first iter
        self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(reduction={repr(self.reduction)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.RegressionPredictionType target, # type: thelper.typedefs.RegressionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. This function computes and accumulates the mean squared error between predictions and targets in the internal array, cycling over the iteration index if the maximum window length is reached. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.errors is None or self.errors.size != curr_win_size: # each 'iteration' will have a corresponding bin with the average MSE loss for that batch self.errors = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.errors[curr_idx] = None return assert pred.shape == target.shape, "prediction/gt tensors shape mismatch" self.errors[curr_idx] = torch.nn.functional.mse_loss(pred, target, reduction=self.reduction).item()
[docs] def eval(self): """Returns the current (average) mean squared error based on the accumulated values. Will issue a warning if no predictions have been accumulated yet. """ if self.errors is None or self.errors.size == 0 or len([d for d in self.errors if d is not None]) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("mean squared error eval result invalid (set as 0.0), no results accumulated") return 0.0 return np.mean([d for d in self.errors if d is not None])
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating the errors array.""" self.errors = None
@property def goal(self): """Returns the scalar optimization goal of this metric (minimization).""" return Metric.minimize
[docs]@thelper.concepts.classification @thelper.concepts.segmentation class ExternalMetric(Metric, ClassNamesHandler): r"""External metric wrapping interface. This interface is used to wrap external metrics and use them in the training framework. The metrics of ``sklearn.metrics`` are good candidates that have been used extensively with this interface in the past, but those of other libraries might also be compatible. Along with the name of the class to import and its constructor's parameters, the user must provide a handling mode that specifies how prediction and groundtruth data should be handled in this wrapper. Also, extra arguments such as target label names, goal information, and window sizes can be provided for specific use cases related to the selected handling mode. For now, two metric handling modes (both related to classification) are supported: * ``classif_best``: the wrapper will accumulate the predicted and groundtruth classification \ labels forwarded by the trainer and provide them to the external metric for evaluation. If \ a target label name is specified, then only classifications related to that label will be \ accumulated. This is the handling mode required for count-based classification metrics such \ as accuracy, F-Measure, precision, recall, etc. * ``classif_score``: the wrapper will accumulate the prediction score of the targeted label \ along with a boolean that indicates whether this label was the groundtruth label or not. This \ is the handling mode required for score-based classification metrics such as when computing \ the area under the ROC curve (AUC). Usage examples inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the first example metric; it is used for lookup/printing only "f1_score_reject": { # this type is used to instantiate the wrapper "type": "thelper.optim.metrics.ExternalMetric", # these parameters are passed to the wrapper's constructor "params": { # the external class to import "metric_name": "sklearn.metrics.f1_score", # the parameters passed to the external class's constructor "metric_params": {}, # the wrapper metric handling mode "metric_type": "classif_best", # the target class name (note: dataset-specific) "target_name": "reject", # the goal type of the external metric "metric_goal": "max" } }, # this is the name of the second example metric; it is used for lookup/printing only "roc_auc_accept": { # this type is used to instantiate the wrapper "type": "thelper.optim.metrics.ExternalMetric", # these parameters are passed to the wrapper's constructor "params": { # the external class to import "metric_name": "sklearn.metrics.roc_auc_score", # the parameters passed to the external class's constructor "metric_params": {}, # the wrapper metric handling mode "metric_type": "classif_score", # the target class name (note: dataset-specific) "target_name": "accept", # the goal type of the external metric "metric_goal": "max" } }, # ... } # ... Attributes: metric_goal: goal of the external metric, used for monitoring. Can be ``min`` or ``max``. metric_type: handling mode of the external metric. Can only be one of the predetermined values. metric: type of the external metric that will be instantiated when ``eval`` is called. metric_params: dictionary of parameters passed to the external metric on instantiation. target_name: name of the targeted label. Used only in handling modes related to classification. target_idx: index of the targeted label. 
Used only in handling modes related to classification. class_names: holds the list of class label names provided by the dataset parser. If it is not provided when the constructor is called, it will be set by the trainer at runtime. force_softmax: specifies whether a softmax operation should be applied to the prediction scores obtained from the trainer. Only used with the "classif_score" handling mode. max_win_size: maximum moving average window size to use (default=None, which equals dataset size). pred: queue used to store predictions-related values for window-based averaging. target: queue used to store groundtruth-related values for window-based averaging. """
    def __init__(self, metric_name, metric_type, metric_goal, metric_params=None, target_name=None,
                 class_names=None, max_win_size=None, force_softmax=True, live_eval=True):
        """Receives all necessary arguments for wrapper initialization and external metric instantiation.

        See :class:`thelper.optim.metrics.ExternalMetric` for information on arguments.
        """
        assert isinstance(metric_name, str), "metric_name must be a fully qualified class name to import"
        assert metric_params is None or isinstance(metric_params, dict), "metric_params must be dictionary"
        supported_handling_types = [
            "classif_top1", "classif_best",  # the former is for backwards-compat with the latter
            "classif_scores", "classif_score",  # the former is for backwards-compat with the latter
            "regression",  # missing impl, work in progress @@@ TODO
        ]
        assert isinstance(metric_type, str) and metric_type in supported_handling_types, \
            f"unknown metric type {repr(metric_type)}"
        if metric_type == "classif_top1":
            metric_type = "classif_best"  # they are identical, just overwrite for backwards compat
        if metric_type == "classif_scores":
            metric_type = "classif_score"  # they are identical, just overwrite for backwards compat
        assert metric_goal is not None and metric_goal in ["max", "min"], "unexpected goal type"
        self.metric_goal = Metric.maximize if metric_goal == "max" else Metric.minimize
        self.metric_type = metric_type
        self.metric_name = metric_name
        self.metric = thelper.utils.import_class(metric_name)
        self.metric_params = metric_params if metric_params is not None else {}
        self.target_name = target_name
        self.target_idx = None
        self.force_softmax = None
        if metric_type == "classif_score":
            self.force_softmax = force_softmax  # only useful in this case
        # elif "regression" in metric_type: missing impl for custom handling @@@
        assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \
            "invalid max sliding window size (should be positive integer)"
        self.max_win_size = max_win_size
        self.pred = None  # will be instantiated on first iter
        self.target = None  # will be instantiated on first iter
        self._live_eval = live_eval  # could be 'False' for external impls that are pretty slow to eval
        ClassNamesHandler.__init__(self, class_names)
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(metric_name={repr(self.metric_name)}, metric_type={repr(self.metric_type)}, " + \ f"metric_goal={'min' if self.goal == Metric.minimize else 'max'}, " + \ f"metric_params={repr(self.metric_params)}, target_name={repr(self.target_name)}, " + \ f"class_names={repr(self.class_names)}, max_win_size={repr(self.max_win_size)}, " + \ f"force_softmax={repr(self.force_softmax)})" @ClassNamesHandler.class_names.setter def class_names(self, class_names): """Sets the class label names that must be predicted by the model. This is only useful in metric handling modes related to classification. The goal of having class names here is to translate a target class label (provided in the constructor) into a target class index. This is required as predictions are not mapped to their original names (in string format) before being forwarded to this object by the trainer. """ if "classif" in self.metric_type: ClassNamesHandler.class_names.fset(self, class_names) if self.target_name is not None and self.class_names is not None: assert self.target_name in self.class_indices, \ f"could not find target name {repr(self.target_name)} in class names list" self.target_idx = self.class_indices[self.target_name] else: self.target_idx = None
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.AnyTargetType target, # type: thelper.typedefs.AnyPredictionType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. The handling of the data received here will depend on the current metric's handling mode. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.pred is None or self.pred.size != curr_win_size: # each 'iteration' will have a corresponding bin with counts for that batch self.pred = np.asarray([None] * curr_win_size) self.target = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if "classif" in self.metric_type: if hasattr(task, "class_names") and task.class_names != self.class_names: self.class_names = task.class_names if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.pred[curr_idx] = None self.target[curr_idx] = None return assert self.target_name is None or self.target_idx is not None, \ f"could not map target name '{self.target_name}' to target idx, missing class list" assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch" if self.target_idx is not None: y_true, y_pred = [], [] if self.metric_type == "classif_best": assert pred.dim() == 2 and target.dim() == 1, "current ext metric implementation only supports batched 1D outputs" pred_label = pred.topk(1, dim=1)[1].view(pred.shape[0]) assert pred_label.numel() == target.numel(), "pred/target classification element count mismatch" must_keep = [y_pred == self.target_idx or y_true == self.target_idx for y_pred, y_true in zip(pred_label, target)] for idx, keep in enumerate(must_keep): if keep: y_true.append(target[idx].item() == self.target_idx) y_pred.append(pred_label[idx].item() == self.target_idx) else: # self.metric_type == "classif_score" if self.force_softmax: with torch.no_grad(): pred = torch.nn.functional.softmax(pred, dim=1) if pred.dim() == 2 and target.dim() == 1: for idx, tgt in enumerate(target): y_true.append(tgt.item() == self.target_idx) y_pred.append(pred[idx, self.target_idx].item()) else: assert pred.dim() > 2 and target.dim() == pred.dim() - 1 and pred.shape[2:] == target.shape[1:] y_true = (target.reshape(-1) == self.target_idx).cpu().numpy() y_pred = pred[:, self.target_idx, ...].reshape(-1).cpu().numpy() self.target[curr_idx] = y_true self.pred[curr_idx] = y_pred else: assert self.metric_type != "classif_score", "score-based classif analysis (e.g. 
roc auc) must specify target label" if self.metric_type == "classif_best": self.target[curr_idx] = [target[idx].item() for idx in range(pred.numel())] self.pred[curr_idx] = [pred[idx].item() for idx in range(pred.numel())] else: # if self.metric_type == "regression": raise NotImplementedError
[docs] def eval(self): """Returns the external metric's evaluation result.""" if "classif" in self.metric_type: assert self.target.size == self.pred.size, "internal window size mismatch" pred, target = zip(*[(pred, target) for preds, targets in zip(self.pred, self.target) if targets is not None for pred, target in zip(preds, targets)]) return self.metric(np.stack(target, axis=0), np.stack(pred, axis=0), **self.metric_params) else: # if self.metric_type == "regression": raise NotImplementedError
[docs] def reset(self): """Toggles a reset of the metric's internal state, emptying pred/target queues.""" self.pred = None self.target = None
@property def goal(self): """Returns the scalar optimization goal of this metric (user-defined).""" return self.metric_goal @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not. By default, this returns ``True``, but implementations that are quite slow may return ``False``. """ return self._live_eval
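# Illustrative sketch (not part of the original module): programmatic instantiation of the
# wrapper above around ``sklearn.metrics.f1_score``, equivalent to the first configuration
# example in the class docstring. The two-entry class name list is hypothetical, and this
# assumes ``thelper.utils.import_class`` resolves the dotted name as documented above.
def _example_external_f1_metric():
    return ExternalMetric(
        metric_name="sklearn.metrics.f1_score",
        metric_type="classif_best",
        metric_goal="max",
        metric_params={},
        target_name="reject",
        class_names=["accept", "reject"],
    )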
[docs]@thelper.concepts.classification @thelper.concepts.segmentation class ROCCurve(Metric, ClassNamesHandler): """Receiver operating characteristic (ROC) computation interface. This class provides an interface to ``sklearn.metrics.roc_curve`` and ``sklearn.metrics.roc_auc_score`` that can produce various types of ROC-related information including the area under the curve (AUC), the false positive and negative rates for various operating points, and the ROC curve itself as an image (also compatible with tensorboardX). By default, evaluating this metric returns the Area Under the Curve (AUC). If a target operating point is set, it will instead return the false positive/negative prediction rate of the model at that point. Usage examples inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the first example; it will output the AUC of the "reject" class "roc_reject_auc": { # this type is used to instantiate the ROC metric "type": "thelper.optim.metrics.ROCCurve", # these parameters are passed to the constructor "params": { # the name of the class to evaluate "target_name": "reject" } }, # this is the name of the second example; it will output the FPR at TPR=0.99 "roc_reject_0.99tpr": { # this type is used to instantiate the ROC metric "type": "thelper.optim.metrics.ROCCurve", # these parameters are passed to the constructor "params": { # the name of the class to evaluate "target_name": "reject", # the target true positive rate (TPR) operating point "target_tpr": 0.99 } }, # ... } # ... Attributes: target_inv: used to target all classes except the named one(s); experimental! target_name: name of targeted class to generate the roc curve/auc information for. target_tpr: target operating point in terms of true positive rate (provided in constructor). target_fpr: target operating point in terms of false positive rate (provided in constructor). target_idx: index of the targeted class, mapped from target_name using the class_names list. class_names: holds the list of class label names provided by the dataset parser. If it is not provided when the constructor is called, it will be set by the trainer at runtime. force_softmax: specifies whether a softmax operation should be applied to the prediction scores obtained from the trainer. curve: roc curve generator function, called at evaluation time to generate the output string. auc: auc score generator function, called at evaluation time to generate the output string. score: queue used to store prediction score values for window-based averaging. true: queue used to store groundtruth label values for window-based averaging. """
[docs] def __init__(self, target_name, target_tpr=None, target_fpr=None, class_names=None, force_softmax=True, sample_weight=None, drop_intermediate=True): """Receives the target class/operating point info, log parameters, and roc computation arguments. Args: target_name: name of targeted class to generate the roc curve/auc information for. target_tpr: target operating point in terms of true positive rate (provided in constructor). target_fpr: target operating point in terms of false positive rate (provided in constructor). class_names: holds the list of class label names provided by the dataset parser. If it is not provided when the constructor is called, it will be set by the trainer at runtime. force_softmax: specifies whether a softmax operation should be applied to the prediction scores obtained from the trainer. sample_weight: passed to ``sklearn.metrics.roc_curve`` and ``sklearn.metrics.roc_auc_score``. drop_intermediate: passed to ``sklearn.metrics.roc_curve``. """ assert target_name is not None, "must provide a target (class) name for ROC metric" self.target_inv = False if isinstance(target_name, str) and target_name[0] == "!": self.target_inv = True self.target_name = target_name.split("!", 1)[1] else: self.target_name = target_name self.target_tpr, self.target_fpr = None, None assert target_tpr is None or target_fpr is None, "must specify only one of target_fpr and target_tpr, not both" if target_tpr is not None or target_fpr is not None: target_xpr = target_tpr if target_tpr is not None else target_fpr assert isinstance(target_xpr, float), "expected float type for target operating point" assert 0 <= target_xpr <= 1, "invalid target operation point value (must be in [0,1])" if target_tpr is not None: self.target_tpr = target_tpr else: # if target_fpr is not None self.target_fpr = target_fpr self.target_idx = None self.force_softmax = force_softmax self.sample_weight = sample_weight self.drop_intermediate = drop_intermediate def gen_curve(y_true, y_score, _target_idx, _target_inv, _sample_weight=sample_weight, _drop_intermediate=drop_intermediate): assert _target_idx is not None, "missing positive target idx at run time" _y_true, _y_score = [], [] for sample_idx, label_idx in enumerate(y_true): _y_true.append(label_idx != _target_idx if _target_inv else label_idx == _target_idx) _y_score.append(1 - y_score[sample_idx, _target_idx] if _target_inv else y_score[sample_idx, _target_idx]) res = sklearn.metrics.roc_curve(_y_true, _y_score, sample_weight=_sample_weight, drop_intermediate=_drop_intermediate) return res def gen_auc(y_true, y_score, _target_idx, _target_inv, _sample_weight=sample_weight): assert _target_idx is not None, "missing positive target idx at run time" _y_true, _y_score = [], [] for sample_idx, label_idx in enumerate(y_true): _y_true.append(label_idx != _target_idx if _target_inv else label_idx == _target_idx) _y_score.append(1 - y_score[sample_idx, _target_idx] if _target_inv else y_score[sample_idx, _target_idx]) res = sklearn.metrics.roc_auc_score(_y_true, _y_score, sample_weight=_sample_weight) return res self.curve = gen_curve self.auc = gen_auc self.score = None self.true = None ClassNamesHandler.__init__(self, class_names)
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(target_name={repr(self.target_name)}, target_tpr={repr(self.target_tpr)}, " + \ f"target_fpr={repr(self.target_fpr)}, class_names={repr(self.class_names)}, " + \ f"force_softmax={repr(self.force_softmax)}, sample_weight={repr(self.sample_weight)}, " + \ f"drop_intermediate={repr(self.drop_intermediate)})" @ClassNamesHandler.class_names.setter def class_names(self, class_names): """Sets the class label names that must be predicted by the model.""" ClassNamesHandler.class_names.fset(self, class_names) if self.target_name is not None and self.class_names is not None: assert self.target_name in self.class_indices, \ f"could not find target name {repr(self.target_name)} in class names list" self.target_idx = self.class_indices[self.target_name] else: self.target_idx = None
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.AnyPredictionType target, # type: thelper.typedefs.AnyTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert isinstance(task, thelper.tasks.Classification), "roc curve only impl for classif tasks" assert not task.multi_label, "roc curve only impl for non-multi-label classif tasks" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" if self.score is None or self.score.size != max_iters: self.score = np.asarray([None] * max_iters) self.true = np.asarray([None] * max_iters) if task.class_names != self.class_names: self.class_names = task.class_names if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.score[iter_idx] = None self.true[iter_idx] = None return assert pred.dim() == 2 or target.dim() == 1, "current classif report impl only supports batched 1D outputs" assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch" assert pred.shape[1] == len(self.class_names), "unexpected prediction class dimension size" if self.force_softmax: with torch.no_grad(): pred = torch.nn.functional.softmax(pred, dim=1) self.score[iter_idx] = pred.numpy() self.true[iter_idx] = target.numpy()
    def eval(self):
        """Returns the evaluation result (AUC/TPR/FPR).

        If no target operating point is set, the returned value is the AUC for the target class. If a
        target TPR is set, the returned value is the FPR for that operating point. If a target FPR is
        set, the returned value is the TPR for that operating point.
        """
        if self.score is None or self.true is None:
            return None
        score, true = zip(*[(score, true) for scores, trues in zip(self.score, self.true)
                            if trues is not None for score, true in zip(scores, trues)])
        # if we did not specify a target operating point in terms of true/false positive rate, return AUC
        if self.target_tpr is None and self.target_fpr is None:
            return self.auc(np.stack(true, axis=0), np.stack(score, axis=0), self.target_idx, self.target_inv)
        # otherwise, find the opposite rate at the requested target operating point
        _fpr, _tpr, _thrs = self.curve(np.stack(true, axis=0), np.stack(score, axis=0),
                                       self.target_idx, self.target_inv, _drop_intermediate=False)
        for fpr, tpr, thrs in zip(_fpr, _tpr, _thrs):
            if self.target_tpr is not None and tpr >= self.target_tpr:
                # print("for target tpr = %.5f, fpr = %.5f at threshold = %f" % (self.target_tpr, fpr, thrs))
                return fpr
            elif self.target_fpr is not None and fpr >= self.target_fpr:
                # print("for target fpr = %.5f, tpr = %.5f at threshold = %f" % (self.target_fpr, tpr, thrs))
                return tpr
        # if we did not find a proper rate match above, return worst possible value
        if self.target_tpr is not None:
            # print("for target tpr = %.5f, fpr = 1.0 at threshold = min" % self.target_tpr)
            return 1.0
        else:  # if self.target_fpr is not None:
            # print("for target fpr = %.5f, tpr = 0.0 at threshold = max" % self.target_fpr)
            return 0.0
[docs] def render(self): """Returns the ROC curve as a numpy-compatible RGBA image drawn by pyplot.""" if self.score is None: return None score, true = zip(*[(score, true) for scores, trues in zip(self.score, self.true) if trues is not None for score, true in zip(scores, trues)]) fpr, tpr, t = self.curve(np.stack(true, axis=0), np.stack(score, axis=0), self.target_idx, self.target_inv) try: fig, ax = thelper.draw.draw_roc_curve(fpr, tpr) array = thelper.draw.fig2array(fig) return array except AttributeError as e: logger.warning(f"failed to render roc curve; caught exception:\n{str(e)}") # return None if rendering fails (probably due to matplotlib on displayless server) return None
[docs] def reset(self): """Toggles a reset of the metric's internal state, emptying queues.""" self.score = None self.true = None
@property def goal(self): """Returns the scalar optimization goal of this metric (variable based on target op point).""" # if we did not specify a target operating point in terms of true/false positive rate, return AUC if self.target_tpr is None and self.target_fpr is None: return Metric.maximize # AUC must be maximized if self.target_tpr is not None: return Metric.minimize # fpr must be minimized else: # if self.target_fpr is not None: return Metric.maximize # tpr must be maximized @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not.""" return False # some operating modes might be pretty slow, check back impl later
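# Illustrative sketch (not part of the original module): reading the false positive rate at a
# target true positive rate directly from ``sklearn.metrics.roc_curve``, mirroring the scan
# performed in ``eval`` above when ``target_tpr`` is set. The function name is hypothetical;
# ``y_true`` holds binary labels and ``y_score`` the positive-class scores.
def _example_fpr_at_tpr(y_true, y_score, target_tpr):
    fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, y_score, drop_intermediate=False)
    for f, t in zip(fpr, tpr):
        if t >= target_tpr:
            return f  # first operating point reaching the requested TPR
    return 1.0  # worst-case FPR if the target TPR is never reached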
[docs]@thelper.concepts.regression class PSNR(Metric): r"""Peak Signal-to-Noise Ratio (PSNR) metric interface. This is a scalar metric used to monitor the change in quality of a signal (or image) following a transformation. For more information, see its definition on `[Wikipedia]`__. .. __: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio The PSNR (in decibels, dB) between a modified signal :math:`x` and its original version :math:`y` is defined as: .. math:: \text{PSNR}(x, y) = 10 * \log_{10} \Bigg( \frac{R^2}{\text{MSE}(x, y)} \Bigg) where :math:`\text{MSE}(x, y)` returns the mean squared error (see :class:`thelper.optim.metrics.MeanSquaredError` for more information), and :math:`R` is the maximum possible value for a single element in the input signal (i.e. its maximum "range"). Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "psnr": { # this type is used to instantiate the metric "type": "thelper.optim.metrics.PSNR", "params": { "data_range": "255" } }, # ... } # ... Attributes: max_win_size: maximum moving average window size to use (default=None, which equals dataset size). data_range: maximum value of an element in the target signal. psnrs: array of psnr values stored for window-based averaging. warned_eval_bad: toggles whether the division-by-zero warning has been flagged or not. """
    def __init__(self, data_range=1.0, max_win_size=None):
        """Receives all necessary initialization arguments to compute signal PSNRs.

        See :class:`thelper.optim.metrics.PSNR` for information on arguments.
        """
        self.max_win_size = max_win_size
        self.psnrs = None  # will be instantiated on first iter
        self.warned_eval_bad = False
        self.data_range = data_range
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(data_range={repr(self.data_range)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.RegressionPredictionType target, # type: thelper.typedefs.RegressionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs, # type: Any ): # type: (...) -> None """Receives the latest predictions and target values from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.psnrs is None or self.psnrs.size != curr_win_size: # each 'iteration' will have a corresponding bin with the psnr for that batch self.psnrs = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if target is None or target.numel() == 0: # only accumulate results when groundtruth is available self.psnrs[curr_idx] = None return assert pred.shape == target.shape, "prediction/gt tensors shape mismatch" mse = np.mean(np.square(pred.numpy() - target.numpy()), dtype=np.float64) self.psnrs[curr_idx] = 10 * np.log10(self.data_range / mse)
[docs] def eval(self): """Returns the current (average) PSNR based on the accumulated values. Will issue a warning if no predictions have been accumulated yet. """ if self.psnrs is None or self.psnrs.size == 0 or len([v for v in self.psnrs if v is not None]) == 0: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("psnr eval result invalid (set as 0.0), no results accumulated") return 0.0 return np.mean([v for v in self.psnrs if v is not None])
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating the psnrs array.""" self.psnrs = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize
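# Illustrative sketch (not part of the original module): PSNR computed as in the class
# docstring above (R^2 over the MSE). Note that ``update`` above feeds ``data_range`` to the
# ratio directly, so the squared term here is an assumption based on the documented formula.
def _example_psnr(pred, target, data_range=1.0):
    pred = np.asarray(pred, dtype=np.float64)
    target = np.asarray(target, dtype=np.float64)
    mse = np.mean(np.square(pred - target))
    return 10.0 * np.log10((data_range ** 2) / mse)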
[docs]@thelper.concepts.detection class AveragePrecision(Metric): r"""Object detection average precision score from PascalVOC. This metric is computed based on the evaluator function implemented in :mod:`thelper.optim.eval`. It can target a single class at a time, or produce the mean average precision for all classes. Usage example inside a session configuration file:: # ... # lists all metrics to instantiate as a dictionary "metrics": { # ... # this is the name of the example metric; it is used for lookup/printing only "mAP": { # this type is used to instantiate the AP metric "type": "thelper.optim.metrics.AveragePrecision", # these parameters are passed to the wrapper's constructor "params": { # no parameters means we will compute the mAP } }, # ... } # ... Attributes: target_class: name of the class to target; if 'None', will compute mAP instead of AP. iou_threshold: Intersection Over Union (IOU) threshold for true/false positive classification. method: the evaluation method to use; can be the the latest & official PASCAL VOC toolkit approach ("all-points"), or the 11-point approach ("11-points") described in the original paper ("The PASCAL Visual Object Classes(VOC) Challenge"). max_win_size: maximum moving average window size to use (default=None, which equals dataset size). preds: array holding the predicted bounding boxes for all input samples. targets: array holding the target bounding boxes for all input samples. """
[docs] def __init__(self, target_class=None, iou_threshold=0.5, method="all-points", max_win_size=None): """Initializes metric attributes. Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on the first update call will be used instead to fix the sliding window length. In any case, the smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size. """ assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \ "invalid max sliding window size (should be positive integer)" self.target_class = target_class self.iou_threshold = iou_threshold self.method = method self.max_win_size = max_win_size self.preds = None # will be instantiated on first iter self.targets = None # will be instantiated on first iter self.task = None
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(target_class={repr(self.target_class)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.DetectionPredictionType target, # type: thelper.typedefs.DetectionTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs): # type: (...) -> None """Receives the latest bbox predictions and targets from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.preds is None or self.preds.size != curr_win_size: # each 'iteration' will have a corresponding bin with counts for that batch self.preds = np.asarray([None] * curr_win_size) self.targets = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size self.task = task # keep reference for eval only if target is None or len(target) == 0: # only accumulate results when groundtruth is available (should we though? affects false negative count) self.preds[curr_idx] = None self.targets[curr_idx] = None return if not pred: pred = [[]] * len(target) assert isinstance(pred, list) and isinstance(target, list) assert all([isinstance(b, list) and all([isinstance(p, thelper.tasks.detect.BoundingBox) for p in b]) for b in pred]) assert all([isinstance(b, list) and all([isinstance(t, thelper.tasks.detect.BoundingBox) for t in b]) for b in target]) self.preds[curr_idx] = pred self.targets[curr_idx] = target
    def eval(self):
        """Returns the current average precision score based on the accumulated predictions and targets.

        Will issue a warning if no predictions have been accumulated yet.
        """
        assert self.targets.size == self.preds.size, "internal window size mismatch"
        pred, target = zip(*[(pred, target) for preds, targets in zip(self.preds, self.targets)
                             if targets for pred, target in zip(preds, targets)])
        # maybe need to concat?
        pred, target = np.concatenate(pred), np.concatenate(target)  # possible due to image ids
        if len(pred) == 0:
            # no predictions made by model
            return float("nan")
        metrics = thelper.optim.eval.compute_pascalvoc_metrics(pred, target, self.task, self.iou_threshold, self.method)
        if self.target_class is None:
            # compute mAP wrt classes that have at least one positive sample
            return np.mean([m["AP"] for m in metrics.values() if m["total positives"] > 0])
        return metrics[self.target_class]["AP"]
[docs] def reset(self): """Toggles a reset of the metric's internal state, deallocating bbox arrays.""" self.preds = None self.targets = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize @property def live_eval(self): """Returns whether this metric can/should be evaluated at every backprop iteration or not.""" return False # the current PascalVOC implementation is preeetty slow with lots of bboxes
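# Illustrative sketch (not part of the original module): the reduction performed by ``eval``
# above when no target class is set, i.e. averaging per-class AP values while ignoring
# classes without any positive sample. The dictionary keys follow those used in ``eval``.
def _example_mean_average_precision(per_class_metrics):
    aps = [m["AP"] for m in per_class_metrics.values() if m["total positives"] > 0]
    return float(np.mean(aps)) if aps else float("nan")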
@thelper.concepts.segmentation
class IntersectionOverUnion(Metric):
    r"""Computes the intersection over union over image classes.

    It can target a single class at a time, or produce the mean IoU (mIoU) for a number of classes.
    It can also average IoU scores from each image, or sum up all intersection and union areas and
    compute a global score.

    Usage example inside a session configuration file::

        # ...
        # lists all metrics to instantiate as a dictionary
        "metrics": {
            # ...
            # this is the name of the example metric; it is used for lookup/printing only
            "mIoU": {
                # this type is used to instantiate the IoU metric
                "type": "thelper.optim.metrics.IntersectionOverUnion",
                # these parameters are passed to the wrapper's constructor
                "params": {
                    # no parameters means we will compute the mIoU with global scoring
                }
            },
            # ...
        }
        # ...

    Attributes:
        target_names: name(s) of the class(es) to target; if 'None' or a list, the mean IoU (mIoU)
            will be computed instead of a single-class IoU.
        max_win_size: maximum moving average window size to use (default=None, which equals dataset size).
        inters: array holding the intersection areas or IoU scores for all input samples.
        unions: array holding the union areas for all input samples.
    """
[docs] def __init__(self, target_names=None, global_score=True, max_win_size=None): """Initializes metric attributes. Note that by default, if ``max_win_size`` is not provided here, the value given to ``max_iters`` on the first update call will be used instead to fix the sliding window length. In any case, the smallest of ``max_iters`` and ``max_win_size`` will be used to determine the actual window size. """ assert max_win_size is None or (isinstance(max_win_size, int) and max_win_size > 0), \ "invalid max sliding window size (should be positive integer)" if target_names is not None and not isinstance(target_names, (list, np.ndarray, torch.Tensor)): target_names = [target_names] self.target_names = target_names self.target_idxs = None # will be updated at runtime self.global_score = global_score self.max_win_size = max_win_size self.inters = None # will be instantiated on first iter self.unions = None # will be instantiated on first iter self.task = None self.warned_eval_bad = False
def __repr__(self): """Returns a generic print-friendly string containing info about this metric.""" return self.__class__.__module__ + "." + self.__class__.__qualname__ + \ f"(target_names={repr(self.target_names)}, global_score={repr(self.global_score)}, max_win_size={repr(self.max_win_size)})"
[docs] def update(self, # see `thelper.typedefs.IterCallbackParams` for more info task, # type: thelper.tasks.utils.Task input, # type: thelper.typedefs.InputType pred, # type: thelper.typedefs.SegmentationPredictionType target, # type: thelper.typedefs.SegmentationTargetType sample, # type: thelper.typedefs.SampleType loss, # type: Optional[float] iter_idx, # type: int max_iters, # type: int epoch_idx, # type: int max_epochs, # type: int output_path, # type: AnyStr **kwargs): # type: (...) -> None """Receives the latest bbox predictions and targets from the training session. The exact signature of this function should match the one of the callbacks defined in :class:`thelper.train.base.Trainer` and specified by ``thelper.typedefs.IterCallbackParams``. """ assert len(kwargs) == 0, "unexpected extra arguments present in update call" assert iter_idx is not None and max_iters is not None and iter_idx < max_iters, \ "bad iteration indices given to metric update function" curr_win_size = max_iters if self.max_win_size is None else min(self.max_win_size, max_iters) if self.inters is None or self.inters.size != curr_win_size: # each 'iteration' will have a corresponding bin with counts for that batch self.inters = np.asarray([None] * curr_win_size) self.unions = np.asarray([None] * curr_win_size) curr_idx = iter_idx % curr_win_size if task is not None: assert isinstance(task, thelper.tasks.Segmentation), "unexpected task type with IoU metric" if self.target_names is not None: assert all([n in task.class_names for n in self.target_names]), \ "missing iou target in task class names" self.target_idxs = [task.class_indices[n] for n in self.target_names] else: self.target_idxs = list(task.class_indices.values()) self.task = task # keep reference for eval only if target is None or len(target) == 0: # only accumulate results when groundtruth is available (should we though? affects false negative count) self.inters[curr_idx] = None self.unions[curr_idx] = None return assert pred.dim() == target.dim() + 1 or pred.dim() == target.dim(), \ "prediction/gt tensors dim mismatch (should be BxCx[...] and Bx[...])" if pred.dim() == target.dim(): assert target.shape[1] == 1, "unexpected channel count (>1) for target tensor" target = torch.squeeze(target, dim=1) assert pred.dim() == target.dim() + 1, "prediction/gt tensors dim mismatch (should be BxCx[...] and Bx[...])" assert pred.shape[0] == target.shape[0], "prediction/gt tensors batch size mismatch" assert pred.dim() <= 2 or pred.shape[2:] == target.shape[1:], "prediction/gt tensors array size mismatch" with torch.no_grad(): pred_labels = pred.topk(1, dim=1)[1].view(pred.shape[0], -1).cpu().numpy() true_labels = target.view(target.shape[0], -1).cpu().numpy() assert self.task is not None, "task object necessary at this point since we need to refer to dontcare value" assert self.target_idxs, "messed up something internally..." 
if self.global_score: inters_count_map, union_count_map = {}, {} for target_idx in self.target_idxs: inters = np.logical_and(pred_labels == target_idx, true_labels == target_idx) inters_count_map[target_idx] = np.count_nonzero(inters) if self.task.dontcare is not None: valid_preds = np.logical_and(pred_labels == target_idx, true_labels != self.task.dontcare) union = np.logical_or(valid_preds, true_labels == target_idx) else: union = np.logical_or(pred_labels == target_idx, true_labels == target_idx) union_count_map[target_idx] = np.count_nonzero(union) self.inters[curr_idx] = inters_count_map self.unions[curr_idx] = union_count_map else: bious = [thelper.optim.eval.compute_mask_iou(pred_labels[b], true_labels[b], self.target_idxs, self.task.dontcare) for b in range(pred.shape[0])] self.inters[curr_idx] = {tidx: [ious[tidx] for ious in bious] for tidx in self.target_idxs} self.unions[curr_idx] = None
[docs] def eval(self): """Returns the current IoU ratio based on the accumulated counts. Will issue a warning if no predictions have been accumulated yet. """ assert self.inters.size == self.unions.size, "internal window size mismatch" if self.target_idxs is None: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("iou eval result invalid (set as 0.0), no results accumulated") return 0.0 accum_pairs = {} valid = False for tidx in self.target_idxs: accum_pairs[tidx] = [], [] for i, u in zip(self.inters, self.unions): if i is not None or u is not None: accum_pairs[tidx][0].append(i[tidx] if i is not None else None) accum_pairs[tidx][1].append(u[tidx] if u is not None else None) valid = valid or i is not None if not valid: if not self.warned_eval_bad: self.warned_eval_bad = True logger.warning("iou eval result invalid (set as 0.0), no results accumulated") return 0.0 if self.global_score: tot_inters = {tidx: sum(accum_pairs[tidx][0]) for tidx in self.target_idxs} tot_union = {tidx: sum(accum_pairs[tidx][1]) for tidx in self.target_idxs} iou_map = {tidx: (tot_inters[tidx] / tot_union[tidx]) if tot_union[tidx] != 0 else 0.0 for tidx in self.target_idxs} else: iou_map = {} for tidx in self.target_idxs: ious = [] for batch_ious in accum_pairs[tidx][0]: for iou in batch_ious: ious.append(iou) iou_map[tidx] = 0.0 if not ious else np.mean(ious) # could add per-class IoU scores to some log before averaging below... return np.array(list(iou_map.values())).mean()
    def reset(self):
        """Toggles a reset of the metric's internal state, deallocating the intersection/union arrays."""
        self.inters = None
        self.unions = None
@property def goal(self): """Returns the scalar optimization goal of this metric (maximization).""" return Metric.maximize
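# Illustrative sketch (not part of the original module): single-class IoU computed from label
# maps the same way as the global-score mode above, optionally ignoring a 'dontcare' value in
# the groundtruth. The function name is hypothetical; inputs are integer label arrays.
def _example_global_iou(pred_labels, true_labels, class_idx, dontcare=None):
    pred_labels, true_labels = np.asarray(pred_labels), np.asarray(true_labels)
    inters = np.logical_and(pred_labels == class_idx, true_labels == class_idx)
    if dontcare is not None:
        valid_preds = np.logical_and(pred_labels == class_idx, true_labels != dontcare)
        union = np.logical_or(valid_preds, true_labels == class_idx)
    else:
        union = np.logical_or(pred_labels == class_idx, true_labels == class_idx)
    union_count = np.count_nonzero(union)
    return np.count_nonzero(inters) / union_count if union_count else 0.0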