metrics

aiice.metrics

Evaluator

Compute and aggregate evaluation metrics over multiple evaluation steps.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| metrics | `dict[str, MetricFn]`, `list[str]` | Metrics to use. If a list of strings is provided, metrics are resolved from the built-in registry. If None, default metrics are used. | None |
| accumulate | `bool` | Whether to accumulate metric values across multiple `eval` calls. | True |
Source code in src/aiice/metrics.py, lines 160-263
class Evaluator:
    """
    Compute and aggregate evaluation metrics over multiple evaluation steps.

    Args:
        metrics (`dict[str, MetricFn]`, `list[str]`, optional):
            Metrics to use. If a list of strings is provided, metrics are resolved
            from the built-in registry. If None, default metrics are used.
        accumulate (`bool`, optional):
            Whether to accumulate metric values across multiple `eval` calls. Defaults to True.
    """

    _metrics_registry: dict[str, MetricFn] = {
        MAE_METRIC: mae,
        MSE_METRIC: mse,
        RMSE_METRIC: rmse,
        PSNR_METRIC: psnr,
        BIN_ACCURACY_METRIC: bin_accuracy,
        SSIM_METRIC: ssim,
        IOU_METRIC: iou,
    }

    def __init__(
        self,
        metrics: dict[str, MetricFn] | list[str] | None = None,
        accumulate: bool = True,
    ):
        if metrics is None:
            self._metrics = self._metrics_registry
        elif isinstance(metrics, list):
            self._metrics = self._init_metrics(metrics)
        else:
            self._metrics = metrics

        self._accumulate = accumulate
        self._report: dict[str, list[float]] = {k: [] for k in self._metrics}

    def _init_metrics(self, metrics: list[str]) -> dict[str, MetricFn]:
        result: dict[str, MetricFn] = {}
        for name in metrics:
            try:
                result[name] = self._metrics_registry[name]
            except KeyError:
                raise ValueError(
                    f"Unknown metric '{name}', choose from {list(self._metrics_registry.keys())}"
                )
        return result

    @property
    def metrics(self) -> list[str]:
        return list(self._metrics.keys())

    def eval(self, y_true: Sequence, y_pred: Sequence) -> dict[str, float]:
        """
        Evaluate all metrics on a single batch or sample and update the internal
        report state according to the ``accumulate`` mode.
        """
        step_result: dict[str, float] = {}

        for name, fn in self._metrics.items():
            value = fn(y_true, y_pred)
            step_result[name] = value

            if self._accumulate:
                self._report[name].append(value)
            else:
                self._report[name] = [value]

        return step_result

    def report(self, detailed: bool = True) -> dict[str, dict[str, float] | float]:
        """
        Return aggregated statistics for all evaluated metrics.

        Args:
            detailed (`bool`, optional):
                If True, returns full statistics for each metric including:
                mean, last value, count, min, and max.
                If False, returns only the mean value per metric.
        """
        summary: dict[str, dict[str, float] | float] = {}
        for name, values in self._report.items():
            if not values:
                continue

            # Filter out nan and inf before aggregation — these indicate samples
            # where the metric is undefined (e.g. PSNR on an all-zero ground truth).
            # Raw values are still preserved in _report for debugging.
            clean = [v for v in values if not math.isnan(v) and not math.isinf(v)]

            if detailed:
                summary[name] = {
                    MEAN_STAT: sum(clean) / len(clean) if clean else float("nan"),
                    LAST_STAT: values[-1],
                    # COUNT_STAT reflects total samples evaluated, including skipped ones,
                    # so you can detect how many were undefined (len(values) - len(clean))
                    COUNT_STAT: len(values),
                    MIN_STAT: min(clean) if clean else float("nan"),
                    MAX_STAT: max(clean) if clean else float("nan"),
                }
            else:
                summary[name] = sum(clean) / len(clean) if clean else float("nan")

        return summary
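
A minimal usage sketch (batch shapes and values are illustrative; metrics can also be given as a list of registered names, or left as None to use every built-in metric):

import torch
from aiice.metrics import Evaluator, mae, rmse

# Explicit dict of metric functions; a list of registered metric names
# or None (all built-in metrics) is also accepted.
evaluator = Evaluator(metrics={"mae": mae, "rmse": rmse}, accumulate=True)

for _ in range(3):  # e.g. one call per validation batch
    y_true = torch.rand(4, 1, 64, 64)  # hypothetical batch of ice concentration fields
    y_pred = torch.rand(4, 1, 64, 64)
    step = evaluator.eval(y_true, y_pred)  # {"mae": ..., "rmse": ...} for this batch

print(evaluator.report(detailed=False))    # per-metric mean over the three batches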

eval

eval(y_true: Sequence, y_pred: Sequence) -> dict[str, float]

Evaluate all metrics on a single batch or sample and update the internal report state according to the accumulate mode.

Source code in src/aiice/metrics.py, lines 212-228
def eval(self, y_true: Sequence, y_pred: Sequence) -> dict[str, float]:
    """
    Evaluate all metrics on a single batch or sample and update the internal
    report state according to the ``accumulate`` mode.
    """
    step_result: dict[str, float] = {}

    for name, fn in self._metrics.items():
        value = fn(y_true, y_pred)
        step_result[name] = value

        if self._accumulate:
            self._report[name].append(value)
        else:
            self._report[name] = [value]

    return step_result
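
A short sketch of how the accumulate flag changes what report later sees (flat sequences used for illustration):

from aiice.metrics import Evaluator, mae

running = Evaluator(metrics={"mae": mae}, accumulate=True)
last_only = Evaluator(metrics={"mae": mae}, accumulate=False)

for y_true, y_pred in [([0.0, 1.0], [0.0, 0.0]), ([0.0, 1.0], [0.0, 1.0])]:
    running.eval(y_true, y_pred)    # step MAEs: 0.5, then 0.0
    last_only.eval(y_true, y_pred)

running.report(detailed=False)      # {"mae": 0.25} - mean over both steps
last_only.report(detailed=False)    # {"mae": 0.0}  - only the last step is kept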

report

report(detailed: bool = True) -> dict[str, dict[str, float] | float]

Return aggregated statistics for all evaluated metrics.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| detailed | `bool` | If True, returns full statistics for each metric including mean, last value, count, min, and max. If False, returns only the mean value per metric. | True |
Source code in src/aiice/metrics.py, lines 230-263
def report(self, detailed: bool = True) -> dict[str, dict[str, float] | float]:
    """
    Return aggregated statistics for all evaluated metrics.

    Args:
        detailed (`bool`, optional):
            If True, returns full statistics for each metric including:
            mean, last value, count, min, and max.
            If False, returns only the mean value per metric.
    """
    summary: dict[str, dict[str, float] | float] = {}
    for name, values in self._report.items():
        if not values:
            continue

        # Filter out nan and inf before aggregation — these indicate samples
        # where the metric is undefined (e.g. PSNR on an all-zero ground truth).
        # Raw values are still preserved in _report for debugging.
        clean = [v for v in values if not math.isnan(v) and not math.isinf(v)]

        if detailed:
            summary[name] = {
                MEAN_STAT: sum(clean) / len(clean) if clean else float("nan"),
                LAST_STAT: values[-1],
                # COUNT_STAT reflects total samples evaluated, including skipped ones,
                # so you can detect how many were undefined (len(values) - len(clean))
                COUNT_STAT: len(values),
                MIN_STAT: min(clean) if clean else float("nan"),
                MAX_STAT: max(clean) if clean else float("nan"),
            }
        else:
            summary[name] = sum(clean) / len(clean) if clean else float("nan")

    return summary
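
A sketch of the two output shapes; the nested keys are whatever the MEAN_STAT, LAST_STAT, COUNT_STAT, MIN_STAT and MAX_STAT constants resolve to, shown below as placeholders:

from aiice.metrics import Evaluator, mae

evaluator = Evaluator(metrics={"mae": mae})
evaluator.eval([0.0, 1.0], [0.0, 0.0])  # mae = 0.5
evaluator.eval([0.0, 1.0], [0.0, 1.0])  # mae = 0.0

evaluator.report(detailed=True)
# {"mae": {<mean>: 0.25, <last>: 0.0, <count>: 2, <min>: 0.0, <max>: 0.5}}
evaluator.report(detailed=False)
# {"mae": 0.25}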

mae

mae(y_true: Sequence, y_pred: Sequence) -> float

MAE (mean absolute error) - measures how closely the predicted values match the ground truth on average, in the original units of the data.

\[\text{MAE} = \frac{1}{N} \sum_{i=1}^{N} |y_i - \hat{y}_i|\]
Source code in src/aiice/metrics.py, lines 32-39
def mae(y_true: Sequence, y_pred: Sequence) -> float:
    """
    MAE (mean absolute error) - measures how closely the predicted values match the ground truth on average, in the original units of the data.

    $$\\text{MAE} = \\frac{1}{N} \\sum_{i=1}^{N} |y_i - \\hat{y}_i|$$
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)
    return torch.abs(y_true - y_pred).mean().item()
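
A quick worked example (flat sequences, purely illustrative):

from aiice.metrics import mae

mae([0.0, 1.0, 2.0], [0.0, 0.0, 0.0])  # (|0| + |1| + |2|) / 3 = 1.0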

mse

mse(y_true: Sequence, y_pred: Sequence) -> float

MSE (mean squared error) - similar to MAE but emphasizes larger errors by squaring differences.

\[\text{MSE} = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2\]
Source code in src/aiice/metrics.py, lines 42-49
def mse(y_true: Sequence, y_pred: Sequence) -> float:
    """
    MSE (mean squared error) - similar to MAE but emphasizes larger errors by squaring differences.

    $$\\text{MSE} = \\frac{1}{N} \\sum_{i=1}^{N} (y_i - \\hat{y}_i)^2$$
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)
    return ((y_true - y_pred) ** 2).mean().item()
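
For the same inputs as the MAE example, squaring inflates the contribution of the largest error:

from aiice.metrics import mse

mse([0.0, 1.0, 2.0], [0.0, 0.0, 0.0])  # (0 + 1 + 4) / 3 ≈ 1.667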

rmse

rmse(y_true: Sequence, y_pred: Sequence) -> float

RMSE (root mean square error) - like MAE, measures agreement with the ground truth in the original units, but gives larger weight to big localized errors in the predicted field.

\[\text{RMSE} = \sqrt{\frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2}\]
Source code in src/aiice/metrics.py, lines 52-60
def rmse(y_true: Sequence, y_pred: Sequence) -> float:
    """
    RMSE (root mean square error) - like MAE, measures agreement with the ground truth in the
    original units, but gives larger weight to big localized errors in the predicted field.

    $$\\text{RMSE} = \\sqrt{\\frac{1}{N} \\sum_{i=1}^{N} (y_i - \\hat{y}_i)^2}$$
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)
    return torch.sqrt(((y_true - y_pred) ** 2).mean()).item()
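
The same inputs once more; RMSE is the square root of the MSE above:

from aiice.metrics import rmse

rmse([0.0, 1.0, 2.0], [0.0, 0.0, 0.0])  # sqrt((0 + 1 + 4) / 3) ≈ 1.291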

psnr

psnr(y_true: Sequence, y_pred: Sequence) -> float

PSNR (peak signal-to-noise ratio) - reflects the level of noise and distortion in the predicted images, helping to identify artifacts.

\[\text{PSNR} = 20 \cdot \log_{10}(\text{MAX}) - 10 \cdot \log_{10}(\text{MSE})\]

where \(\text{MAX}\) is the maximum value of the ground truth field.

Source code in src/aiice/metrics.py, lines 63-84
def psnr(y_true: Sequence, y_pred: Sequence) -> float:
    """
    PSNR (peak signal-to-noise ratio) - reflects the level of noise and distortion in the predicted images, helping to identify artifacts.

    $$\\text{PSNR} = 20 \\cdot \\log_{10}(\\text{MAX}) - 10 \\cdot \\log_{10}(\\text{MSE})$$

    where $\\text{MAX}$ is the maximum value of the ground truth field.
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)

    mse_val = torch.mean((y_true - y_pred) ** 2)
    if mse_val == 0:
        return float("inf")

    # MAX = 0 means the ground truth field is entirely ice-free.
    # PSNR is undefined in this case — return nan so the Evaluator
    # can exclude this sample from aggregation rather than corrupting the mean.
    if torch.max(y_true) == 0:
        return float("nan")

    max_val = torch.max(y_true)
    return (20 * torch.log10(max_val) - 10 * torch.log10(mse_val)).item()
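
A small sketch of the edge cases handled above (concentration values assumed to lie in [0, 1]):

import torch
from aiice.metrics import psnr

y_true = torch.tensor([[0.0, 0.5], [1.0, 0.25]])

psnr(y_true, y_true)                    # inf: perfect prediction, MSE == 0
psnr(torch.zeros_like(y_true), y_true)  # nan: all-zero ground truth, PSNR undefined
psnr(y_true, y_true + 0.1)              # 20*log10(1.0) - 10*log10(0.01) ≈ 20.0 dB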

bin_accuracy

bin_accuracy(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float

Binary accuracy - thresholds the continuous ice concentration field at the ice-edge value, making it possible to compare the binary masks of real and predicted ice extent pixel by pixel.

\[\text{BinAcc} = \frac{1}{N} \sum_{i=1}^{N} \mathbf{1}\bigl[\hat{b}_i = b_i\bigr]\]

where \(b_i = \mathbf{1}[y_i > \tau]\) and \(\hat{b}_i = \mathbf{1}[\hat{y}_i > \tau]\) are binary masks obtained by thresholding with \(\tau\).

Source code in src/aiice/metrics.py, lines 87-102
def bin_accuracy(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float:
    """
    Binary accuracy - thresholds the continuous ice concentration field at the ice-edge value,
    making it possible to compare the binary masks of real and predicted ice extent pixel by pixel.

    $$\\text{BinAcc} = \\frac{1}{N} \\sum_{i=1}^{N} \\mathbf{1}\\bigl[\\hat{b}_i = b_i\\bigr]$$

    where $b_i = \\mathbf{1}[y_i > \\tau]$ and $\\hat{b}_i = \\mathbf{1}[\\hat{y}_i > \\tau]$ are binary masks
    obtained by thresholding with $\\tau$.
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)

    y_true = apply_threshold(y_true, threshold)
    y_pred = apply_threshold(y_pred, threshold)

    return (y_true == y_pred).float().mean().item()
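
A worked example at the default ice-edge threshold of 0.15 (flat sequences for illustration):

from aiice.metrics import bin_accuracy

y_true = [0.0, 0.10, 0.20, 0.90]  # thresholded at 0.15 -> [0, 0, 1, 1]
y_pred = [0.0, 0.30, 0.20, 0.05]  # thresholded at 0.15 -> [0, 1, 1, 0]

bin_accuracy(y_true, y_pred)      # masks agree at 2 of 4 pixels -> 0.5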

ssim

ssim(y_true: Sequence, y_pred: Sequence) -> float

SSIM (structural similarity index measure) - measures how well spatial patterns in the predicted image match those in the target image.

\[\text{SSIM}(x, y) = \frac{(2\mu_x\mu_y + c_1)(2\sigma_{xy} + c_2)}{(\mu_x^2 + \mu_y^2 + c_1)(\sigma_x^2 + \sigma_y^2 + c_2)}\]

where \(\mu_x, \mu_y\) are local means, \(\sigma_x^2, \sigma_y^2\) are local variances, \(\sigma_{xy}\) is cross-covariance, and \(c_1, c_2\) are stabilization constants.

Raises:

| Type | Description |
|------|-------------|
| ValueError | If input tensors are not 4D ([N, C, H, W]) or 5D ([N, C, D, H, W]), or if any spatial or temporal dimension is smaller than 11 (the minimum SSIM kernel window size). |
Source code in src/aiice/metrics.py, lines 105-126
def ssim(y_true: Sequence, y_pred: Sequence) -> float:
    """
    SSIM (structural similarity index measure) - measures how well spatial patterns in the predicted image match those in the target image.

    $$\\text{SSIM}(x, y) = \\frac{(2\\mu_x\\mu_y + c_1)(2\\sigma_{xy} + c_2)}{(\\mu_x^2 + \\mu_y^2 + c_1)(\\sigma_x^2 + \\sigma_y^2 + c_2)}$$

    where $\\mu_x, \\mu_y$ are local means, $\\sigma_x^2, \\sigma_y^2$ are local variances,
    $\\sigma_{xy}$ is cross-covariance, and $c_1, c_2$ are stabilization constants.

    Raises:
        ValueError:
            - If input tensors are not 4D ([N, C, H, W]) or 5D ([N, C, D, H, W]).
            - If any spatial or temporal dimension is smaller than 11 (minimum SSIM kernel window size)
    """
    # Convert Sequence inputs to tensors before inspecting their shape.
    y_true, y_pred = _as_tensor(y_true, y_pred)

    spatial_dims = y_true.shape[2:]
    if any(dim < DEFAULT_SSIM_KERNEL_WINDOW_SIZE for dim in spatial_dims):
        raise ValueError(
            f"All spatial dimensions {spatial_dims} must be >= win_size={DEFAULT_SSIM_KERNEL_WINDOW_SIZE}"
        )

    return float(pytorch_msssim.ssim(y_true, y_pred, data_range=1.0))
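
A small sketch; inputs are batched [N, C, H, W] tensors whose spatial dimensions are at least the 11-pixel SSIM window:

import torch
from aiice.metrics import ssim

y_true = torch.rand(1, 1, 32, 32)  # values in [0, 1]

ssim(y_true, y_true)                    # identical fields -> 1.0
ssim(y_true, torch.rand(1, 1, 32, 32))  # unrelated noise -> close to 0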

iou

iou(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float

IoU (Intersection over Union) - measures overlap between binary masks of ground truth and prediction.

Similar to bin_accuracy but focuses on overlap quality instead of per-pixel equality.

\[\text{IoU} = \frac{|B \cap \hat{B}|}{|B \cup \hat{B}|} = \frac{|B \cap \hat{B}|}{|B| + |\hat{B}| - |B \cap \hat{B}|}\]

where \(B = \mathbf{1}[y > \tau]\) and \(\hat{B} = \mathbf{1}[\hat{y} > \tau]\) are binary ice extent masks.

Source code in src/aiice/metrics.py, lines 129-154
def iou(y_true: Sequence, y_pred: Sequence, threshold: float = 0.15) -> float:
    """
    IoU (Intersection over Union) - measures overlap between binary masks
    of ground truth and prediction.

    Similar to bin_accuracy but focuses on overlap quality instead of per-pixel equality.

    $$\\text{IoU} = \\frac{|B \\cap \\hat{B}|}{|B \\cup \\hat{B}|} = \\frac{|B \\cap \\hat{B}|}{|B| + |\\hat{B}| - |B \\cap \\hat{B}|}$$

    where $B = \\mathbf{1}[y > \\tau]$ and $\\hat{B} = \\mathbf{1}[\\hat{y} > \\tau]$ are binary ice extent masks.
    """
    y_true, y_pred = _as_tensor(y_true, y_pred)

    y_true = apply_threshold(y_true, threshold)
    y_pred = apply_threshold(y_pred, threshold)

    y_true = y_true.view(y_true.size(0), -1)
    y_pred = y_pred.view(y_pred.size(0), -1)

    intersection = (y_true * y_pred).sum(dim=1)
    # union = |A| + |B| - |A ∩ B|
    union = y_true.sum(dim=1) + y_pred.sum(dim=1) - intersection

    eps = 1e-7
    iou = intersection / (union + eps)
    return iou.mean().item()
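
A worked example with the default threshold of 0.15, using a batch of one flattened mask for illustration:

from aiice.metrics import iou

y_true = [[0.0, 0.20, 0.90, 0.80]]  # ice extent mask after threshold: [0, 1, 1, 1]
y_pred = [[0.0, 0.20, 0.90, 0.05]]  # ice extent mask after threshold: [0, 1, 1, 0]

iou(y_true, y_pred)                 # intersection 2, union 3 -> ≈ 0.667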