Skip to content

preprocess

aiice.preprocess

SlidingWindowDataset

Bases: Dataset

Convert a time series into (X, Y) pairs using sliding windows.

X represents past observations of length pre_history_len, Y represents future observations of length forecast_len.

image

The dataset is generated lazily: windows are sliced on demand from the original tensor without materializing the full dataset in memory. The time dimension is assumed to be the first axis of the input tensor.

Parameters:

Name Type Description Default
data `Sequence`

Time series data of shape [T, ...] where T is the time dimension and remaining dimensions represent features or channels.

required
pre_history_len `int`

Number of time steps in each input window (X).

required
forecast_len `int`

Number of time steps in each output window (Y).

required
idx `Sequence`

Optional sequence of any indeces corresponding to each time step in data. Must have the same length as the time dimension T. If provided, __getitem__ returns a tuple (id, X, Y) containing the corresponding timestamps for the selected window, otherwise it returns only (X, Y).

None
threshold `float`

If provided, binarizes the target tensor Y using this threshold. Values strictly greater than the threshold are set to 1, and values less than or equal to the threshold are set to 0. Defaults to None.

None
x_binarize `bool`

If True and threshold is provided, applies the same binarization to the input tensor X. Defaults to False.

False
device `str`

Device on which to place the tensors (e.g., "cpu", "cuda"). Defaults to None.

None
dtype dtype

Data type used to convert the input sequence. Defaults to torch.float32.

float32
Source code in src/aiice/preprocess.py
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
class SlidingWindowDataset(Dataset):
    """
    Convert a time series into (X, Y) pairs using sliding windows.

    X represents past observations of length `pre_history_len`,
    Y represents future observations of length `forecast_len`.

    ![image](../media/sliding-window.png)

    The dataset is generated lazily: windows are sliced on demand from the
    original tensor without materializing the full dataset in memory.
    The time dimension is assumed to be the first axis of the input tensor.

    Args:
        data (`Sequence`): Time series data of shape `[T, ...]` where `T` is the time dimension
            and remaining dimensions represent features or channels.
        pre_history_len (`int`): Number of time steps in each input window (X).
        forecast_len (`int`): Number of time steps in each output window (Y).
        idx (`Sequence`, optional): Optional sequence of any indeces corresponding
            to each time step in `data`. Must have the same length as the time dimension `T`.
            If provided, `__getitem__` returns a tuple `(id, X, Y)` containing the
            corresponding timestamps for the selected window, otherwise it returns only `(X, Y)`.
        threshold (`float`, optional): If provided, binarizes the target tensor Y using this threshold.
            Values strictly greater than the threshold are set to 1, and values less than or equal to
            the threshold are set to 0. Defaults to None.
        x_binarize (`bool`, optional): If True and `threshold` is provided, applies the same binarization
            to the input tensor X. Defaults to False.
        device (`str`, optional): Device on which to place the tensors (e.g., "cpu", "cuda"). Defaults to None.
        dtype (torch.dtype, optional): Data type used to convert the input sequence. Defaults to torch.float32.
    """

    def __init__(
        self,
        data: Sequence,
        pre_history_len: int,
        forecast_len: int,
        idx: Sequence | None = None,
        threshold: float | None = None,
        x_binarize: bool = False,
        device: str | None = None,
        dtype: torch.dtype = torch.float32,
    ):
        self._data = torch.as_tensor(data, dtype=dtype, device=device)
        self._indices = idx

        self._threshold = threshold
        self._x_binarize = x_binarize

        if self._data.ndim == 1:
            self._data = self._data.unsqueeze(-1)  # [T] -> [T, 1]

        self._pre_history_len = pre_history_len
        self._forecast_len = forecast_len

        self._T = self._data.shape[0]
        if self._indices is not None and self._T != len(self._indices):
            raise ValueError(
                f"Data length (got {self._T}) should be equal to indices length (got {len(self._indices)})"
            )

        self._length = self._T - pre_history_len - forecast_len + 1

        if self._length <= 0:
            raise ValueError(
                f"Not enough data: got {self._T}, need at least {pre_history_len + forecast_len}"
            )

    def __len__(self):
        return self._length

    def __getitem__(self, idx: int):
        if not isinstance(idx, int):
            raise TypeError("index must be int")

        if idx < 0 or idx >= self._length:
            raise IndexError("index out of range")

        x = self._data[idx : idx + self._pre_history_len]
        y = self._data[
            idx
            + self._pre_history_len : idx
            + self._pre_history_len
            + self._forecast_len
        ]

        if isinstance(self._threshold, float):
            y = apply_threshold(y, self._threshold)
            x = apply_threshold(x, self._threshold) if self._x_binarize else x

        if self._indices is not None:
            idx_slice = self._indices[
                idx : idx + self._pre_history_len + self._forecast_len
            ]
            return idx_slice, x, y

        return x, y

apply_threshold

apply_threshold(tensor: Tensor, threshold: float = 0.5) -> torch.Tensor

Binarize tensor with a threshold

Source code in src/aiice/preprocess.py
7
8
9
def apply_threshold(tensor: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    "Binarize tensor with a threshold"
    return (tensor > threshold).to(tensor.dtype)

apply_downsample

apply_downsample(t: Tensor, i: int, axes: tuple[int, ...] = (-1,)) -> torch.Tensor

Downsample a tensor by keeping every i-th element along specified axes.

Parameters:

Name Type Description Default
t `torch.Tensor`

Input tensor.

required
i `int`

Step for downsampling. Must be greater than 0.

required
axes `tuple[int]`

Axes along which to downsample. Negative axes are supported.

(-1,)
Source code in src/aiice/preprocess.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def apply_downsample(
    t: torch.Tensor, i: int, axes: tuple[int, ...] = (-1,)
) -> torch.Tensor:
    """
    Downsample a tensor by keeping every i-th element along specified axes.

    Args:
        t (`torch.Tensor`): Input tensor.
        i (`int`): Step for downsampling. Must be greater than 0.
        axes (`tuple[int]`): Axes along which to downsample. Negative axes are supported.
    """
    if i <= 0:
        raise ValueError("i must be > 0")

    out = t
    for axis in axes:
        axis = axis if axis >= 0 else t.dim() + axis

        idx = torch.arange(out.shape[axis], device=out.device)
        keep = idx % i == 0
        out = torch.index_select(out, axis, idx[keep])

    return out