metrics

aiice.metrics

Evaluator

Compute and aggregate evaluation metrics over multiple evaluation steps.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| metrics | `dict[str, MetricFn]`, `list[str]` | Metrics to use. If a list of strings is provided, metrics are resolved from the built-in registry. If None, default metrics are used. | None |
| accumulate | `bool` | Whether to accumulate metric values across multiple `eval` calls. | True |
Source code in src/aiice/metrics.py, lines 160-263
class Evaluator:
    """
    Compute and aggregate evaluation metrics over multiple evaluation steps.

    Args:
        metrics (`dict[str, MetricFn]`, `list[str]`, optional):
            Metrics to use. If a list of strings is provided, metrics are resolved
            from the built-in registry. If None, default metrics are used.
        accumulate (`bool`, optional):
            Whether to accumulate metric values across multiple `eval` calls. Defaults to True.
    """

    _metrics_registry: dict[str, MetricFn] = {
        MAE_METRIC: mae,
        MSE_METRIC: mse,
        RMSE_METRIC: rmse,
        PSNR_METRIC: psnr,
        BIN_ACCURACY_METRIC: bin_accuracy,
        SSIM_METRIC: ssim,
        IOU_METRIC: iou,
    }

    def __init__(
        self,
        metrics: dict[str, MetricFn] | list[str] | None = None,
        accumulate: bool = True,
    ):
        if metrics is None:
            self._metrics = self._metrics_registry
        elif isinstance(metrics, list):
            self._metrics = self._init_metrics(metrics)
        else:
            self._metrics = metrics

        self._accumulate = accumulate
        self._report: dict[str, list[float]] = {k: [] for k in self._metrics}

    def _init_metrics(self, metrics: list[str]) -> dict[str, MetricFn]:
        result: dict[str, MetricFn] = {}
        for name in metrics:
            try:
                result[name] = self._metrics_registry[name]
            except KeyError:
                raise ValueError(
                    f"Unknown metric '{name}', choose from {list(self._metrics_registry.keys())}"
                )
        return result

    @property
    def metrics(self) -> list[str]:
        return list(self._metrics.keys())

    def eval(self, y_true: Sequence, y_pred: Sequence) -> dict[str, float]:
        """
        Evaluate all metrics on a single batch or sample and update the internal
        report state according to the ``accumulate`` mode.
        """
        step_result: dict[str, float] = {}

        for name, fn in self._metrics.items():
            value = fn(y_true, y_pred)
            step_result[name] = value

            if self._accumulate:
                self._report[name].append(value)
            else:
                self._report[name] = [value]

        return step_result

    def report(self, detailed: bool = True) -> dict[str, dict[str, float] | float]:
        """
        Return aggregated statistics for all evaluated metrics.

        Args:
            detailed (`bool`, optional):
                If True, returns full statistics for each metric including:
                mean, last value, count, min, and max.
                If False, returns only the mean value per metric.
        """
        summary: dict[str, dict[str, float] | float] = {}
        for name, values in self._report.items():
            if not values:
                continue

            # Filter out nan and inf before aggregation — these indicate samples
            # where the metric is undefined (e.g. PSNR on an all-zero ground truth).
            # Raw values are still preserved in _report for debugging.
            clean = [v for v in values if not math.isnan(v) and not math.isinf(v)]

            if detailed:
                summary[name] = {
                    MEAN_STAT: sum(clean) / len(clean) if clean else float("nan"),
                    LAST_STAT: values[-1],
                    # COUNT_STAT reflects total samples evaluated, including skipped ones,
                    # so you can detect how many were undefined (len(values) - len(clean))
                    COUNT_STAT: len(values),
                    MIN_STAT: min(clean) if clean else float("nan"),
                    MAX_STAT: max(clean) if clean else float("nan"),
                }
            else:
                summary[name] = sum(clean) / len(clean) if clean else float("nan")

        return summary
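
A minimal usage sketch (batch shapes and values are illustrative; metrics can also be given as a list of registered names, or left as None to use every built-in metric):

import torch
from aiice.metrics import Evaluator, mae, rmse

# Explicit dict of metric functions; a list of registered metric names
# or None (all built-in metrics) is also accepted.
evaluator = Evaluator(metrics={"mae": mae, "rmse": rmse}, accumulate=True)

for _ in range(3):  # e.g. one call per validation batch
    y_true = torch.rand(4, 1, 64, 64)  # hypothetical batch of ice concentration fields
    y_pred = torch.rand(4, 1, 64, 64)
    step = evaluator.eval(y_true, y_pred)  # {"mae": ..., "rmse": ...} for this batch

print(evaluator.report(detailed=False))    # per-metric mean over the three batches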

eval

eval(y_true: Sequence, y_pred: Sequence) -> dict[str, float]

Evaluate all metrics on a single batch or sample and update the internal report state according to the accumulate mode.

Source code in src/aiice/metrics.py, lines 212-228
def eval(self, y_true: Sequence, y_pred: Sequence) -> dict[str, float]:
    """
    Evaluate all metrics on a single batch or sample and update the internal
    report state according to the ``accumulate`` mode.
    """
    step_result: dict[str, float] = {}

    for name, fn in self._metrics.items():
        value = fn(y_true, y_pred)
        step_result[name] = value

        if self._accumulate:
            self._report[name].append(value)
        else:
            self._report[name] = [value]

    return step_result
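
A short sketch of how the accumulate flag changes what report later sees (flat sequences used for illustration):

from aiice.metrics import Evaluator, mae

running = Evaluator(metrics={"mae": mae}, accumulate=True)
last_only = Evaluator(metrics={"mae": mae}, accumulate=False)

for y_true, y_pred in [([0.0, 1.0], [0.0, 0.0]), ([0.0, 1.0], [0.0, 1.0])]:
    running.eval(y_true, y_pred)    # step MAEs: 0.5, then 0.0
    last_only.eval(y_true, y_pred)

running.report(detailed=False)      # {"mae": 0.25} - mean over both steps
last_only.report(detailed=False)    # {"mae": 0.0}  - only the last step is kept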

report

report(detailed: bool = True) -> dict[str, dict[str, float] | float]

Return aggregated statistics for all evaluated metrics.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| detailed | `bool` | If True, returns full statistics for each metric including mean, last value, count, min, and max. If False, returns only the mean value per metric. | True |
Source code in src/aiice/metrics.py, lines 230-263
def report(self, detailed: bool = True) -> dict[str, dict[str, float] | float]:
    """
    Return aggregated statistics for all evaluated metrics.

    Args:
        detailed (`bool`, optional):
            If True, returns full statistics for each metric including:
            mean, last value, count, min, and max.
            If False, returns only the mean value per metric.
    """
    summary: dict[str, dict[str, float] | float] = {}
    for name, values in self._report.items():
        if not values:
            continue

        # Filter out nan and inf before aggregation — these indicate samples
        # where the metric is undefined (e.g. PSNR on an all-zero ground truth).
        # Raw values are still preserved in _report for debugging.
        clean = [v for v in values if not math.isnan(v) and not math.isinf(v)]

        if detailed:
            summary[name] = {
                MEAN_STAT: sum(clean) / len(clean) if clean else float("nan"),
                LAST_STAT: values[-1],
                # COUNT_STAT reflects total samples evaluated, including skipped ones,
                # so you can detect how many were undefined (len(values) - len(clean))
                COUNT_STAT: len(values),
                MIN_STAT: min(clean) if clean else float("nan"),
                MAX_STAT: max(clean) if clean else float("nan"),
            }
        else:
            summary[name] = sum(clean) / len(clean) if clean else float("nan")

    return summary
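
A sketch of the two output shapes; the nested keys are whatever the MEAN_STAT, LAST_STAT, COUNT_STAT, MIN_STAT and MAX_STAT constants resolve to, shown below as placeholders:

from aiice.metrics import Evaluator, mae

evaluator = Evaluator(metrics={"mae": mae})
evaluator.eval([0.0, 1.0], [0.0, 0.0])  # mae = 0.5
evaluator.eval([0.0, 1.0], [0.0, 1.0])  # mae = 0.0

evaluator.report(detailed=True)
# {"mae": {<mean>: 0.25, <last>: 0.0, <count>: 2, <min>: 0.0, <max>: 0.5}}
evaluator.report(detailed=False)
# {"mae": 0.25}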

mae

mae(y_true: Sequence, y_pred: Sequence) -> float

MAE (mean absolute error) - measures how closely the predicted values match the ground truth on average, in the original units of the data.

\[\text{MAE} = \frac{1}{N} \sum_{i=1}^{N} |y_i - \hat{y}_i|\]
Source code in src/aiice/metrics.py, lines 32-39
def mae(y_true: Sequence, y_pred: Sequence) -> float:
    """
    MAE (mean absolute error) - measures how closely the predicted values match the ground truth on average, in the original units of the data.

    $$\\text{MAE} = \\frac{1}{N} \\sum_{i=1}^{N} |y_i - \\hat{y}_i|$$
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)
    return torch.abs(y_true - y_pred).mean().item()
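
A quick worked example (flat sequences, purely illustrative):

from aiice.metrics import mae

mae([0.0, 1.0, 2.0], [0.0, 0.0, 0.0])  # (|0| + |1| + |2|) / 3 = 1.0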

mse

mse(y_true: Sequence, y_pred: Sequence) -> float

MSE (mean squared error) - similar to MAE but emphasizes larger errors by squaring differences.

\[\text{MSE} = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2\]
Source code in src/aiice/metrics.py, lines 42-49
def mse(y_true: Sequence, y_pred: Sequence) -> float:
    """
    MSE (mean squared error) - similar to MAE but emphasizes larger errors by squaring differences.

    $$\\text{MSE} = \\frac{1}{N} \\sum_{i=1}^{N} (y_i - \\hat{y}_i)^2$$
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)
    return ((y_true - y_pred) ** 2).mean().item()
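
For the same inputs as the MAE example, squaring inflates the contribution of the largest error:

from aiice.metrics import mse

mse([0.0, 1.0, 2.0], [0.0, 0.0, 0.0])  # (0 + 1 + 4) / 3 ≈ 1.667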

rmse

rmse(y_true: Sequence, y_pred: Sequence) -> float

RMSE (root mean square error) - like MAE, measures agreement with the ground truth in the original units, but gives larger weight to big localized errors in the predicted field.

\[\text{RMSE} = \sqrt{\frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2}\]
Source code in src/aiice/metrics.py, lines 52-60
def rmse(y_true: Sequence, y_pred: Sequence) -> float:
    """
    RMSE (root mean square error) - like MAE, measures agreement with the ground truth in the
    original units, but gives larger weight to big localized errors in the predicted field.

    $$\\text{RMSE} = \\sqrt{\\frac{1}{N} \\sum_{i=1}^{N} (y_i - \\hat{y}_i)^2}$$
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)
    return torch.sqrt(((y_true - y_pred) ** 2).mean()).item()
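
The same inputs once more; RMSE is the square root of the MSE above:

from aiice.metrics import rmse

rmse([0.0, 1.0, 2.0], [0.0, 0.0, 0.0])  # sqrt((0 + 1 + 4) / 3) ≈ 1.291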

psnr

psnr(y_true: Sequence, y_pred: Sequence) -> float

PSNR (peak signal-to-noise ratio) - reflects the level of noise and distortion in the predicted images, helping to identify artifacts.

\[\text{PSNR} = 20 \cdot \log_{10}(\text{MAX}) - 10 \cdot \log_{10}(\text{MSE})\]

where \(\text{MAX}\) is the maximum value of the ground truth field.

Source code in src/aiice/metrics.py, lines 63-84
def psnr(y_true: Sequence, y_pred: Sequence) -> float:
    """
    PSNR (peak signal-to-noise ratio) - reflects the level of noise and distortion in the predicted images, helping to identify artifacts.

    $$\\text{PSNR} = 20 \\cdot \\log_{10}(\\text{MAX}) - 10 \\cdot \\log_{10}(\\text{MSE})$$

    where $\\text{MAX}$ is the maximum value of the ground truth field.
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)

    mse_val = torch.mean((y_true - y_pred) ** 2)
    if mse_val == 0:
        return float("inf")

    # MAX = 0 means the ground truth field is entirely ice-free.
    # PSNR is undefined in this case — return nan so the Evaluator
    # can exclude this sample from aggregation rather than corrupting the mean.
    if torch.max(y_true) == 0:
        return float("nan")

    max_val = torch.max(y_true)
    return (20 * torch.log10(max_val) - 10 * torch.log10(mse_val)).item()
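
A small sketch of the edge cases handled above (concentration values assumed to lie in [0, 1]):

import torch
from aiice.metrics import psnr

y_true = torch.tensor([[0.0, 0.5], [1.0, 0.25]])

psnr(y_true, y_true)                    # inf: perfect prediction, MSE == 0
psnr(torch.zeros_like(y_true), y_true)  # nan: all-zero ground truth, PSNR undefined
psnr(y_true, y_true + 0.1)              # 20*log10(1.0) - 10*log10(0.01) ≈ 20.0 dB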

bin_accuracy

bin_accuracy(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float

Binary accuracy - thresholds the continuous ice concentration field at the ice-edge value, making it possible to compare the binary masks of real and predicted ice extent pixel by pixel.

\[\text{BinAcc} = \frac{1}{N} \sum_{i=1}^{N} \mathbf{1}\bigl[\hat{b}_i = b_i\bigr]\]

where \(b_i = \mathbf{1}[y_i > \tau]\) and \(\hat{b}_i = \mathbf{1}[\hat{y}_i > \tau]\) are binary masks obtained by thresholding with \(\tau\).

Source code in src/aiice/metrics.py, lines 87-102
def bin_accuracy(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float:
    """
    Binary accuracy - thresholds the continuous ice concentration field at the ice-edge value,
    making it possible to compare the binary masks of real and predicted ice extent pixel by pixel.

    $$\\text{BinAcc} = \\frac{1}{N} \\sum_{i=1}^{N} \\mathbf{1}\\bigl[\\hat{b}_i = b_i\\bigr]$$

    where $b_i = \\mathbf{1}[y_i > \\tau]$ and $\\hat{b}_i = \\mathbf{1}[\\hat{y}_i > \\tau]$ are binary masks
    obtained by thresholding with $\\tau$.
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)

    y_true = apply_threshold(y_true, threshold)
    y_pred = apply_threshold(y_pred, threshold)

    return (y_true == y_pred).float().mean().item()
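
A worked example at the default ice-edge threshold of 0.15 (flat sequences for illustration):

from aiice.metrics import bin_accuracy

y_true = [0.0, 0.10, 0.20, 0.90]  # thresholded at 0.15 -> [0, 0, 1, 1]
y_pred = [0.0, 0.30, 0.20, 0.05]  # thresholded at 0.15 -> [0, 1, 1, 0]

bin_accuracy(y_true, y_pred)      # masks agree at 2 of 4 pixels -> 0.5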

ssim

ssim(y_true: Sequence, y_pred: Sequence) -> float

SSIM (structural similarity index measure) - measures how well spatial patterns in the predicted image match those in the target image.

\[\text{SSIM}(x, y) = \frac{(2\mu_x\mu_y + c_1)(2\sigma_{xy} + c_2)}{(\mu_x^2 + \mu_y^2 + c_1)(\sigma_x^2 + \sigma_y^2 + c_2)}\]

where \(\mu_x, \mu_y\) are local means, \(\sigma_x^2, \sigma_y^2\) are local variances, \(\sigma_{xy}\) is cross-covariance, and \(c_1, c_2\) are stabilization constants.

Raises:

| Type | Description |
|------|-------------|
| ValueError | If input tensors are not 4D ([N, C, H, W]) or 5D ([N, C, D, H, W]), or if any spatial or temporal dimension is smaller than 11 (the minimum SSIM kernel window size). |
Source code in src/aiice/metrics.py, lines 105-126
def ssim(y_true: Sequence, y_pred: Sequence) -> float:
    """
    SSIM (structural similarity index measure) - measures how well spatial patterns in the predicted image match those in the target image.

    $$\\text{SSIM}(x, y) = \\frac{(2\\mu_x\\mu_y + c_1)(2\\sigma_{xy} + c_2)}{(\\mu_x^2 + \\mu_y^2 + c_1)(\\sigma_x^2 + \\sigma_y^2 + c_2)}$$

    where $\\mu_x, \\mu_y$ are local means, $\\sigma_x^2, \\sigma_y^2$ are local variances,
    $\\sigma_{xy}$ is cross-covariance, and $c_1, c_2$ are stabilization constants.

    Raises:
        ValueError:
            - If input tensors are not 4D ([N, C, H, W]) or 5D ([N, C, D, H, W]).
            - If any spatial or temporal dimension is smaller than 11 (minimum SSIM kernel window size)
    """
    # Convert Sequence inputs to tensors before inspecting their shape.
    y_true, y_pred = _as_tensor(y_true, y_pred)

    spatial_dims = y_true.shape[2:]
    if any(dim < DEFAULT_SSIM_KERNEL_WINDOW_SIZE for dim in spatial_dims):
        raise ValueError(
            f"All spatial dimensions {spatial_dims} must be >= win_size={DEFAULT_SSIM_KERNEL_WINDOW_SIZE}"
        )

    return float(pytorch_msssim.ssim(y_true, y_pred, data_range=1.0))
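
A small sketch; inputs are batched [N, C, H, W] tensors whose spatial dimensions are at least the 11-pixel SSIM window:

import torch
from aiice.metrics import ssim

y_true = torch.rand(1, 1, 32, 32)  # values in [0, 1]

ssim(y_true, y_true)                    # identical fields -> 1.0
ssim(y_true, torch.rand(1, 1, 32, 32))  # unrelated noise -> close to 0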

iou

iou(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float

IoU (Intersection over Union) - measures overlap between binary masks of ground truth and prediction.

Similar to bin_accuracy but focuses on overlap quality instead of per-pixel equality.

\[\text{IoU} = \frac{|B \cap \hat{B}|}{|B \cup \hat{B}|} = \frac{|B \cap \hat{B}|}{|B| + |\hat{B}| - |B \cap \hat{B}|}\]

where \(B = \mathbf{1}[y > \tau]\) and \(\hat{B} = \mathbf{1}[\hat{y} > \tau]\) are binary ice extent masks.

Source code in src/aiice/metrics.py, lines 129-154
def iou(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float:
    """
    IoU (Intersection over Union) - measures overlap between binary masks
    of ground truth and prediction.

    Similar to bin_accuracy but focuses on overlap quality instead of per-pixel equality.

    $$\\text{IoU} = \\frac{|B \\cap \\hat{B}|}{|B \\cup \\hat{B}|} = \\frac{|B \\cap \\hat{B}|}{|B| + |\\hat{B}| - |B \\cap \\hat{B}|}$$

    where $B = \\mathbf{1}[y > \\tau]$ and $\\hat{B} = \\mathbf{1}[\\hat{y} > \\tau]$ are binary ice extent masks.
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)

    y_true = apply_threshold(y_true, threshold)
    y_pred = apply_threshold(y_pred, threshold)

    y_true = y_true.view(y_true.size(0), -1)
    y_pred = y_pred.view(y_pred.size(0), -1)

    intersection = (y_true * y_pred).sum(dim=1)
    # union = |A| + |B| - |A ∩ B|
    union = y_true.sum(dim=1) + y_pred.sum(dim=1) - intersection

    eps = 1e-7
    iou = intersection / (union + eps)
    return iou.mean().item()
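
A worked example with the default threshold of 0.15, using a batch of one flattened mask for illustration:

from aiice.metrics import iou

y_true = [[0.0, 0.20, 0.90, 0.80]]  # ice extent mask after threshold: [0, 1, 1, 1]
y_pred = [[0.0, 0.20, 0.90, 0.05]]  # ice extent mask after threshold: [0, 1, 1, 0]

iou(y_true, y_pred)                 # intersection 2, union 3 -> ≈ 0.667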