"""Time series cross-validation splitters."""
import logging
from typing import Any, Iterator, Tuple
import pandas as pd
logger = logging.getLogger(__name__)
[docs]
class ExpandingWindowSplit:
"""Expanding window cross-validation splitter.
Each fold uses all data up to the cutoff point for training,
and tests on the next window.
Attributes:
initial_window: Initial training window size.
step_size: Step size between folds.
fh: Forecast horizon for each fold.
"""
[docs]
def __init__(
self,
initial_window: int,
step_size: int = 1,
fh: int = 1,
):
"""Initialize expanding window splitter.
Args:
initial_window: Initial training window size.
step_size: Step size between folds.
fh: Forecast horizon for each fold.
"""
self.initial_window = initial_window
self.step_size = step_size
self.fh = fh
[docs]
def split(self, y: Any) -> Iterator[Tuple[Any, Any, Any]]:
"""Generate train/test splits.
Args:
y: Time series data (Series or DataFrame with time index).
Yields:
Tuples of (train_indices, test_indices, cutoff).
"""
if isinstance(y, pd.Series):
n = len(y)
elif isinstance(y, pd.DataFrame):
n = len(y)
else:
raise TypeError(f"y must be Series or DataFrame, got {type(y).__name__}")
if n < self.initial_window + self.fh:
raise ValueError(
f"Data length ({n}) must be >= initial_window ({self.initial_window}) "
f"+ fh ({self.fh})"
)
cutoff = self.initial_window
fold_id = 0
while cutoff + self.fh <= n:
train_end = cutoff
test_start = cutoff
test_end = min(cutoff + self.fh, n)
train_indices = slice(0, train_end)
test_indices = slice(test_start, test_end)
logger.debug(
f"Fold {fold_id}: train=[0:{train_end}], test=[{test_start}:{test_end}], "
f"cutoff={cutoff}"
)
yield train_indices, test_indices, cutoff
cutoff += self.step_size
fold_id += 1
[docs]
class SlidingWindowSplit:
"""Sliding window cross-validation splitter.
Each fold uses a fixed-size window for training,
and tests on the next window.
Attributes:
window_size: Training window size.
step_size: Step size between folds.
fh: Forecast horizon for each fold.
"""
[docs]
def __init__(
self,
window_size: int,
step_size: int = 1,
fh: int = 1,
):
"""Initialize sliding window splitter.
Args:
window_size: Training window size.
step_size: Step size between folds.
fh: Forecast horizon for each fold.
"""
self.window_size = window_size
self.step_size = step_size
self.fh = fh
[docs]
def split(self, y: Any) -> Iterator[Tuple[Any, Any, Any]]:
"""Generate train/test splits.
Args:
y: Time series data (Series or DataFrame with time index).
Yields:
Tuples of (train_indices, test_indices, cutoff).
"""
if isinstance(y, pd.Series):
n = len(y)
elif isinstance(y, pd.DataFrame):
n = len(y)
else:
raise TypeError(f"y must be Series or DataFrame, got {type(y).__name__}")
if n < self.window_size + self.fh:
raise ValueError(
f"Data length ({n}) must be >= window_size ({self.window_size}) "
f"+ fh ({self.fh})"
)
train_start = 0
fold_id = 0
while train_start + self.window_size + self.fh <= n:
train_end = train_start + self.window_size
test_start = train_end
test_end = min(test_start + self.fh, n)
cutoff = train_end
train_indices = slice(train_start, train_end)
test_indices = slice(test_start, test_end)
logger.debug(
f"Fold {fold_id}: train=[{train_start}:{train_end}], "
f"test=[{test_start}:{test_end}], cutoff={cutoff}"
)
yield train_indices, test_indices, cutoff
train_start += self.step_size
fold_id += 1