Source code for timesmith.core.featurizers

"""Featurizer implementations for time series feature engineering."""

import logging
from typing import TYPE_CHECKING, Any, List, Optional, Union

import numpy as np
import pandas as pd

from timesmith.core.base import BaseFeaturizer
from timesmith.core.tags import set_tags
from timesmith.utils.rolling import (
    rolling_max,
    rolling_mean,
    rolling_median,
    rolling_min,
    rolling_std,
)
from timesmith.utils.ts_utils import ensure_datetime_index

if TYPE_CHECKING:
    from timesmith.typing import SeriesLike, TableLike

logger = logging.getLogger(__name__)


class LagFeaturizer(BaseFeaturizer):
    """Create lagged features from a time series.

    Transforms SeriesLike input into a TableLike frame whose columns are the
    original values plus shifted copies of them. Optionally adds lag
    differences, percentage changes and seasonal lags. With
    ``prevent_leads=True`` only strictly positive lags are produced, so no
    column looks into the future.
    """

    def __init__(
        self,
        lags: List[int] = [1, 2, 3, 7, 14],
        include_diff: bool = False,
        include_pct_change: bool = False,
        seasonal_lags: Optional[List[int]] = None,
        prevent_leads: bool = True,
    ):
        """Initialize lag featurizer.

        Args:
            lags: List of lag periods to create.
            include_diff: If True, add ``diff_{lag}`` columns (value minus
                lagged value) for every positive lag.
            include_pct_change: If True, add ``pct_change_{lag}`` columns for
                every positive lag.
            seasonal_lags: Optional list of seasonal lag periods
                (e.g., [12, 24] for monthly).
            prevent_leads: If True, non-positive lags are dropped from
                ``lags`` (with a warning) and non-positive seasonal lags are
                skipped in ``transform``.
        """
        if prevent_leads:
            # Only strictly positive lags look backwards in time; zero and
            # negative entries are removed.
            self.lags = [lag for lag in lags if lag > 0]
            if len(self.lags) < len(lags):
                logger.warning(f"Filtered out non-positive lags. Using: {self.lags}")
        else:
            # Copy so that neither the shared default list nor the caller's
            # list is aliased by this instance.
            self.lags = list(lags)

        self.include_diff = include_diff
        self.include_pct_change = include_pct_change
        # Copy for the same aliasing reason as above.
        self.seasonal_lags = list(seasonal_lags) if seasonal_lags else []
        self.prevent_leads = prevent_leads

        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self,
        y: Union["SeriesLike", Any],
        X: Optional[Union["TableLike", Any]] = None,
        **fit_params: Any,
    ) -> "LagFeaturizer":
        """Fit the featurizer (no-op for lags).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    @staticmethod
    def _shift(values: np.ndarray, lag: int) -> np.ndarray:
        """Return ``values`` shifted by ``lag`` positions, padded with NaN.

        A positive ``lag`` moves data towards later positions (a lag), a
        negative ``lag`` moves it towards earlier positions (a lead), and
        zero returns a plain copy. A shift at least as long as the array
        yields an all-NaN result.
        """
        shifted = np.full(len(values), np.nan, dtype=np.float64)
        if lag > 0:
            shifted[lag:] = values[:-lag]
        elif lag < 0:
            shifted[:lag] = values[-lag:]
        else:
            shifted[:] = values
        return shifted

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create lag features using vectorized NumPy operations.

        Args:
            y: SeriesLike data (a Series or a single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame indexed like ``y`` with a ``value`` column
            followed by ``lag_*``, optional ``diff_*`` / ``pct_change_*`` and
            ``seasonal_lag_*`` columns.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")
        index = y.index

        # Work on a float64 array so NaN padding is always representable.
        values = np.asarray(series, dtype=np.float64)
        n = len(values)

        feature_dict = {"value": values}

        # Standard lags. Negative entries (only possible when
        # prevent_leads=False) become leads instead of raising a shape error.
        for lag in self.lags:
            feature_dict[f"lag_{lag}"] = self._shift(values, lag)

        # Differences against the lagged value; defined for positive lags only.
        if self.include_diff:
            for lag in self.lags:
                if lag > 0:
                    diff_values = np.full(n, np.nan, dtype=np.float64)
                    diff_values[lag:] = values[lag:] - values[:-lag]
                    feature_dict[f"diff_{lag}"] = diff_values

        # Relative change against the lagged value; positive lags only.
        if self.include_pct_change:
            for lag in self.lags:
                if lag > 0:
                    pct_values = np.full(n, np.nan, dtype=np.float64)
                    prev_values = values[:-lag]
                    curr_values = values[lag:]
                    # A zero denominator is left as NaN rather than +/-inf.
                    nonzero = prev_values != 0
                    # Basic slicing returns a view, so writing through
                    # ``tail`` updates ``pct_values`` in place.
                    tail = pct_values[lag:]
                    tail[nonzero] = (
                        curr_values[nonzero] - prev_values[nonzero]
                    ) / prev_values[nonzero]
                    feature_dict[f"pct_change_{lag}"] = pct_values

        # Seasonal lags; non-positive ones are skipped when leads are forbidden.
        for seasonal_lag in self.seasonal_lags:
            if seasonal_lag > 0 or not self.prevent_leads:
                feature_dict[f"seasonal_lag_{seasonal_lag}"] = self._shift(
                    values, seasonal_lag
                )

        # Build the frame in one go (faster than column-by-column insertion).
        return pd.DataFrame(feature_dict, index=index)
class RollingFeaturizer(BaseFeaturizer):
    """Create rolling window features from a time series.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus one ``rolling_{func}_{window}`` column per requested
    statistic and window size.
    """

    def __init__(
        self,
        windows: List[int] = [7, 14, 30],
        functions: List[str] = ["mean", "std"],
        n_jobs: Optional[int] = None,
    ):
        """Initialize rolling featurizer.

        Args:
            windows: List of window sizes.
            functions: List of functions to apply
                ('mean', 'std', 'min', 'max', 'median').
            n_jobs: Number of parallel jobs, forwarded to
                ``rolling_statistics``. The original documentation states
                that None uses all CPUs; that behaviour lives in the helper
                and is not visible here.
        """
        # Copy so the shared default lists (or the caller's lists) are never
        # aliased and cannot be mutated through this instance.
        self.windows = list(windows)
        self.functions = list(functions)
        self.n_jobs = n_jobs

        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "RollingFeaturizer":
        """Fit the featurizer (no-op for rolling features).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create rolling features.

        Args:
            y: SeriesLike data (a Series or a single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame indexed like ``y`` with a ``value`` column
            followed by the rolling statistic columns.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")
        index = y.index

        values = np.asarray(series, dtype=np.float64)
        feature_dict = {"value": values}

        # Imported lazily, as in the original implementation.
        from timesmith.utils.rolling import rolling_statistics

        if len(self.windows) * len(self.functions) > 4:
            # Many (window, function) pairs: delegate to the batched helper.
            # NOTE(review): unlike the direct path below, min_periods is not
            # passed here and unknown function names are not filtered -
            # confirm rolling_statistics handles both the same way.
            rolling_results = rolling_statistics(
                values, self.windows, self.functions, n_jobs=self.n_jobs
            )
            for key, result in rolling_results.items():
                # NaN std values are replaced by 0 (the original comment says
                # this matches pandas behaviour - not verified here).
                if key.startswith("rolling_std_"):
                    result = np.nan_to_num(result, nan=0.0)
                feature_dict[key] = result
        else:
            # Few operations: call the individual rolling helpers directly.
            function_map = {
                "mean": rolling_mean,
                "std": rolling_std,
                "min": rolling_min,
                "max": rolling_max,
                "median": rolling_median,
            }
            for window in self.windows:
                for func in self.functions:
                    if func not in function_map:
                        logger.warning(f"Unknown function {func}, skipping")
                        continue
                    result = function_map[func](values, window, min_periods=1)
                    # Same NaN -> 0 replacement for std as in the batched path.
                    if func == "std":
                        result = np.nan_to_num(result, nan=0.0)
                    feature_dict[f"rolling_{func}_{window}"] = result

        # Build the frame in one go (faster than column-by-column insertion).
        return pd.DataFrame(feature_dict, index=index)
class TimeFeaturizer(BaseFeaturizer):
    """Create calendar features from a datetime index.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus year, month, day, weekday, ISO week, quarter and
    weekend / month-boundary indicator columns derived from the index.
    """

    def __init__(self):
        """Initialize time featurizer."""
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=False,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "TimeFeaturizer":
        """Fit the featurizer (no-op for time features).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create time features.

        Args:
            y: SeriesLike data (a Series or a single-column DataFrame) whose
                index is, or can be turned into, a datetime index by
                ``ensure_datetime_index``.
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and the calendar
            feature columns, indexed by the normalised datetime index.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        # Normalise the index on the series itself (not on an empty stand-in)
        # so the values stay attached to their labels. Building the frame
        # from the original series plus a converted index would re-align by
        # label and turn every value into NaN whenever the labels changed.
        # A copy is passed in case the helper modifies its argument in place.
        series = ensure_datetime_index(series.copy())
        index = series.index

        df = pd.DataFrame({"value": series}, index=index)
        df["year"] = index.year
        df["month"] = index.month
        df["day"] = index.day
        df["dayofweek"] = index.dayofweek
        df["dayofyear"] = index.dayofyear
        df["week"] = index.isocalendar().week
        df["quarter"] = index.quarter
        # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
        df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
        df["is_month_start"] = index.is_month_start.astype(int)
        df["is_month_end"] = index.is_month_end.astype(int)

        return df
class DifferencingFeaturizer(BaseFeaturizer):
    """Create differenced features from a time series.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus one ``diff_{order}`` column per requested differencing order.
    """

    def __init__(self, orders: List[int] = [1]):
        """Initialize differencing featurizer.

        Args:
            orders: List of differencing orders (e.g., [1] for first
                difference, [1, 2] for first and second differences).
        """
        # Copy so the shared default list (or the caller's list) is never
        # aliased and cannot be mutated through this instance.
        self.orders = list(orders)

        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "DifferencingFeaturizer":
        """Fit the featurizer (no-op for differencing).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create differenced features.

        Args:
            y: SeriesLike data (a Series or a single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and one
            ``diff_{order}`` column per entry of ``orders``.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        df = pd.DataFrame({"value": series})

        for order in self.orders:
            # An order-k difference is the first difference applied k times;
            # an order of zero (or below) leaves the series unchanged.
            diff_series = series
            for _ in range(order):
                diff_series = diff_series.diff()
            df[f"diff_{order}"] = diff_series

        return df
class SeasonalFeaturizer(BaseFeaturizer):
    """Encode seasonality as sine/cosine pairs.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus, for every seasonal period, a sine and a cosine column
    evaluated at the row position (0, 1, 2, ...).
    """

    def __init__(self, seasonal_periods: List[int]):
        """Initialize seasonal featurizer.

        Args:
            seasonal_periods: List of seasonal periods
                (e.g., [12, 365] for monthly/yearly).
        """
        self.seasonal_periods = seasonal_periods
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "SeasonalFeaturizer":
        """Mark the featurizer as fitted; nothing is learned from the data.

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Build the sine/cosine seasonal columns.

        Args:
            y: SeriesLike data (a Series or a single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column followed by
            ``seasonal_sin_{period}`` / ``seasonal_cos_{period}`` pairs.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        single_column_frame = isinstance(y, pd.DataFrame) and y.shape[1] == 1
        if not isinstance(y, pd.Series) and not single_column_frame:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")
        data = y if isinstance(y, pd.Series) else y.iloc[:, 0]

        out = pd.DataFrame({"value": data})

        # 2*pi*t is shared by every period; only the divisor changes.
        two_pi_t = 2 * np.pi * np.arange(len(data))
        for cycle in self.seasonal_periods:
            angle = two_pi_t / cycle
            out[f"seasonal_sin_{cycle}"] = np.sin(angle)
            out[f"seasonal_cos_{cycle}"] = np.cos(angle)

        return out
class DegradationRateFeaturizer(BaseFeaturizer):
    """Create degradation rate features (rate of change) from a time series.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus one percentage-change column per requested period.
    """

    def __init__(self, periods: List[int] = [1, 3, 5]):
        """Initialize degradation rate featurizer.

        Args:
            periods: List of periods for rate of change calculation.
        """
        # Copy so the shared default list (or the caller's list) is never
        # aliased and cannot be mutated through this instance.
        self.periods = list(periods)

        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "DegradationRateFeaturizer":
        """Fit the featurizer (no-op for degradation rates).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create degradation rate features.

        Args:
            y: SeriesLike data (a Series or a single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and one
            ``degradation_rate_{period}`` column per entry of ``periods``.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        df = pd.DataFrame({"value": series})

        for period in self.periods:
            # Rate of change relative to the value ``period`` rows earlier.
            # NOTE(review): how missing values are filled before the change
            # is computed depends on the installed pandas version's
            # pct_change default - confirm if inputs may contain NaN.
            df[f"degradation_rate_{period}"] = series.pct_change(periods=period)

        return df