Source code for pysatl_tsp.implementations.processor.time_series_cross_validator

from collections.abc import Iterator
from typing import Any, Optional

from pysatl_tsp.core import Handler, T
from pysatl_tsp.core.processor import MappingHandler
from pysatl_tsp.core.scrubber import ScrubberWindow, SlidingScrubber


class TimeSeriesCrossValidator(Handler[T, tuple[ScrubberWindow[T], ScrubberWindow[T]]]):
    """A handler that implements expanding window cross-validation for time series data.

    This handler produces a sequence of train-validation splits suitable for time series
    validation, where each split preserves the temporal order of data. It implements an
    expanding window approach, where the training set grows over time while the validation
    set has a fixed size and slides forward.

    The handler ensures that:

    1. The training set always has at least `min_train_size` points
    2. The validation set always has exactly `val_size` points
    3. The validation set always follows the training set temporally
    4. Each new split adds `val_size` points to the training set

    This approach respects the temporal nature of time series data and prevents data
    leakage from future to past.

    :param min_train_size: Minimum number of points in the initial training set
        (must not be negative)
    :param val_size: Number of points in each validation set (must be positive)
    :param source: The handler providing input data, defaults to None
    :raises ValueError: If ``min_train_size`` is negative or ``val_size`` is not positive

    Example:

    .. code-block:: python

        import numpy as np
        import matplotlib.pyplot as plt

        # Generate a synthetic time series
        np.random.seed(42)
        ts = np.cumsum(np.random.normal(0, 1, 100))  # Random walk
        data_source = SimpleDataProvider(ts)

        # Create a cross-validator with min_train_size=50 and val_size=10
        cv = TimeSeriesCrossValidator(min_train_size=50, val_size=10, source=data_source)

        # Visualize the different train-validation splits
        plt.figure(figsize=(14, 8))
        x = np.arange(len(ts))
        plt.plot(x, ts, "k-", alpha=0.3, label="Full time series")

        for i, (train, val) in enumerate(cv):
            train_indices = list(train.indices)
            val_indices = list(val.indices)

            # Plot each split
            plt.plot(train_indices, [ts[i] for i in train_indices], "b-", linewidth=2, alpha=0.7 - i * 0.1)
            plt.plot(val_indices, [ts[i] for i in val_indices], "r-", linewidth=2, alpha=0.7 - i * 0.1)

            # Add markers at the split point
            split_idx = train_indices[-1]
            plt.axvline(x=split_idx, color="g", linestyle="--", alpha=0.5)

            # Print information about this split
            print(f"Split {i + 1}:")
            print(f" Train: {len(train)} points (indices {train_indices[0]}..{train_indices[-1]})")
            print(f" Validation: {len(val)} points (indices {val_indices[0]}..{val_indices[-1]})")

        plt.title("Time Series Cross-Validation: Expanding Window Approach")
        plt.xlabel("Time")
        plt.ylabel("Value")

        # Add custom legend
        from matplotlib.lines import Line2D

        custom_lines = [
            Line2D([0], [0], color="k", alpha=0.3),
            Line2D([0], [0], color="b", linewidth=2),
            Line2D([0], [0], color="r", linewidth=2),
            Line2D([0], [0], color="g", linestyle="--"),
        ]
        plt.legend(
            custom_lines, ["Full time series", "Training sets", "Validation sets", "Split points"], loc="upper left"
        )
        plt.grid(True, alpha=0.3)
        plt.show()

        # Example model evaluation with each split
        from sklearn.linear_model import LinearRegression

        for i, (train, val) in enumerate(cv):
            # Prepare data
            train_indices = list(train.indices)
            train_X = np.array(train_indices).reshape(-1, 1)
            train_y = np.array(list(train.values))

            val_indices = list(val.indices)
            val_X = np.array(val_indices).reshape(-1, 1)
            val_y = np.array(list(val.values))

            # Train a simple model
            model = LinearRegression()
            model.fit(train_X, train_y)

            # Evaluate on validation set
            val_pred = model.predict(val_X)
            mse = np.mean((val_pred - val_y) ** 2)
            print(f"Split {i + 1} - Validation MSE: {mse:.4f}")
    """

    def __init__(self, min_train_size: int, val_size: int, source: Optional[Handler[Any, T]] = None):
        """Initialize a time series cross-validator.

        The sizes are validated eagerly: ``val_size`` is used as a modulus and as a
        negative slice bound in :meth:`__iter__`, so zero would raise
        ``ZeroDivisionError`` in the middle of iteration and a negative value would
        silently slice the windows from the wrong end.

        :param min_train_size: Minimum number of points in the initial training set
            (must not be negative)
        :param val_size: Number of points in each validation set (must be positive)
        :param source: The handler providing input data, defaults to None
        :raises ValueError: If ``min_train_size`` is negative or ``val_size`` is not positive
        """
        if val_size <= 0:
            raise ValueError(f"val_size must be a positive integer, got {val_size}")
        if min_train_size < 0:
            raise ValueError(f"min_train_size must be a non-negative integer, got {min_train_size}")

        super().__init__(source)
        self.min_train_size = min_train_size
        self.val_size = val_size

    def __iter__(self) -> Iterator[tuple[ScrubberWindow[T], ScrubberWindow[T]]]:
        """Create an iterator that yields train-validation splits for time series cross-validation.

        This method creates splits where:

        1. The first split has exactly min_train_size points for training
        2. Each subsequent split adds val_size points to the training set
        3. Each validation set has exactly val_size points and follows the training set

        Because this method is a generator, the source check below runs when the
        first item is requested, not when ``iter()`` is called.

        :return: Iterator yielding tuples of (training_window, validation_window)
        :raises ValueError: If no source has been set
        """
        if self.source is None:
            raise ValueError("Source is not set")

        # A window is taken whenever the buffer holds min_train_size points plus a
        # whole, non-zero number of validation chunks.
        # NOTE(review): shift=0 presumably keeps the scrubber's buffer from advancing,
        # which is what makes the window expand -- confirm against SlidingScrubber.
        scrubber = SlidingScrubber(
            lambda buffer: len(buffer) > self.min_train_size
            and (len(buffer) - self.min_train_size) % self.val_size == 0,
            shift=0,
            source=self.source,
        )

        # Split each window: the trailing val_size points become the validation set,
        # everything before them is the training set.
        handler: MappingHandler[ScrubberWindow[T], tuple[ScrubberWindow[T], ScrubberWindow[T]]] = MappingHandler(
            map_func=lambda window: (window[: -self.val_size], window[-self.val_size :])
        )

        yield from (scrubber | handler)