Source code for timesmith.core.featurizers
"""Featurizer implementations for time series feature engineering."""
import logging
from typing import TYPE_CHECKING, Any, List, Optional, Union
import numpy as np
import pandas as pd
from timesmith.core.base import BaseFeaturizer
from timesmith.core.tags import set_tags
from timesmith.utils.rolling import (
rolling_max,
rolling_mean,
rolling_median,
rolling_min,
rolling_std,
)
from timesmith.utils.ts_utils import ensure_datetime_index
if TYPE_CHECKING:
from timesmith.typing import SeriesLike, TableLike
logger = logging.getLogger(__name__)
[docs]
class LagFeaturizer(BaseFeaturizer):
    """Create lagged features from a time series.

    Transforms SeriesLike to TableLike by creating lag features.
    Optionally adds lag differences, percentage changes and seasonal lags,
    and can drop non-positive lags so that no future values leak into the
    features.
    """

    def __init__(
        self,
        lags: List[int] = [1, 2, 3, 7, 14],
        include_diff: bool = False,
        include_pct_change: bool = False,
        seasonal_lags: Optional[List[int]] = None,
        prevent_leads: bool = True,
    ):
        """Initialize lag featurizer.

        Args:
            lags: List of lag periods to create. Positive values look back;
                negative values are leads (only kept when ``prevent_leads``
                is False).
            include_diff: If True, include differenced features
                (``value[t] - value[t - lag]``) for every positive lag.
            include_pct_change: If True, include percentage change features
                for every positive lag.
            seasonal_lags: Optional list of seasonal lag periods
                (e.g., [12, 24] for monthly).
            prevent_leads: If True, only strictly positive lags are used:
                non-positive entries of ``lags`` are dropped here (with a
                warning) and non-positive ``seasonal_lags`` are skipped in
                ``transform``.
        """
        if prevent_leads:
            # Keep strictly positive lags only; zero and negative lags would
            # expose the current or future value.
            self.lags = [lag for lag in lags if lag > 0]
            if len(self.lags) < len(lags):
                logger.warning(f"Filtered out non-positive lags. Using: {self.lags}")
        else:
            # Copy so the instance never aliases the caller's list or the
            # shared mutable default of the signature.
            self.lags = list(lags)
        self.include_diff = include_diff
        self.include_pct_change = include_pct_change
        self.seasonal_lags = seasonal_lags or []
        self.prevent_leads = prevent_leads
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self,
        y: Union["SeriesLike", Any],
        X: Optional[Union["TableLike", Any]] = None,
        **fit_params: Any,
    ) -> "LagFeaturizer":
        """Fit the featurizer (no-op for lags).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    @staticmethod
    def _shifted_copy(values: np.ndarray, lag: int) -> np.ndarray:
        """Return ``values`` shifted by ``lag`` steps, padded with NaN.

        A positive ``lag`` moves past values forward (row t holds
        ``values[t - lag]``); a negative ``lag`` is a lead (row t holds
        ``values[t + |lag|]``); zero returns a plain copy.
        """
        shifted = np.full(len(values), np.nan, dtype=np.float64)
        if lag > 0:
            shifted[lag:] = values[:-lag]
        elif lag < 0:
            # Lead: the last |lag| rows have no future value and stay NaN.
            shifted[:lag] = values[-lag:]
        else:
            shifted[:] = values
        return shifted

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create lag features using vectorized NumPy operations.

        Output columns, in order: ``value``, ``lag_{k}`` for each lag,
        then (if enabled) ``diff_{k}`` and ``pct_change_{k}`` for each
        positive lag, then ``seasonal_lag_{k}`` for each seasonal lag.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with lag features, indexed like ``y``.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
            index = y.index
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
            index = y.index
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        # Work on a float64 array so every feature is a plain slice operation.
        values = np.asarray(series, dtype=np.float64)
        n = len(values)
        feature_dict = {"value": values}

        # Standard lag features. Negative lags can only be present when
        # prevent_leads is False and are treated as leads.
        for lag in self.lags:
            feature_dict[f"lag_{lag}"] = self._shifted_copy(values, lag)

        # Difference features (positive lags only).
        if self.include_diff:
            for lag in self.lags:
                if lag > 0:
                    diff_values = np.full(n, np.nan, dtype=np.float64)
                    diff_values[lag:] = values[lag:] - values[:-lag]
                    feature_dict[f"diff_{lag}"] = diff_values

        # Percentage change features (positive lags only).
        if self.include_pct_change:
            for lag in self.lags:
                if lag > 0:
                    pct_values = np.full(n, np.nan, dtype=np.float64)
                    prev_values = values[:-lag]
                    curr_values = values[lag:]
                    # Rows whose previous value is zero are left as NaN
                    # instead of dividing by zero.
                    mask = prev_values != 0
                    # pct_values[lag:] is a view, so the masked assignment
                    # writes through to pct_values.
                    pct_values[lag:][mask] = (
                        curr_values[mask] - prev_values[mask]
                    ) / prev_values[mask]
                    feature_dict[f"pct_change_{lag}"] = pct_values

        # Seasonal lag features; non-positive periods are skipped while
        # lead prevention is on.
        for seasonal_lag in self.seasonal_lags:
            if seasonal_lag > 0 or not self.prevent_leads:
                feature_dict[f"seasonal_lag_{seasonal_lag}"] = self._shifted_copy(
                    values, seasonal_lag
                )

        # Build the frame once from the dictionary rather than column by column.
        return pd.DataFrame(feature_dict, index=index)
[docs]
class RollingFeaturizer(BaseFeaturizer):
    """Build rolling-window statistics from a time series.

    Transforms SeriesLike to TableLike: the output holds the original
    values plus one column per (function, window) combination.
    """

    def __init__(
        self,
        windows: List[int] = [7, 14, 30],
        functions: List[str] = ["mean", "std"],
        n_jobs: Optional[int] = None,
    ):
        """Initialize rolling featurizer.

        Args:
            windows: List of window sizes.
            functions: List of functions to apply
                ('mean', 'std', 'min', 'max', 'median').
            n_jobs: Forwarded as ``n_jobs`` to ``rolling_statistics``; only
                used when more than four window/function combinations are
                requested.
        """
        self.windows = windows
        self.functions = functions
        self.n_jobs = n_jobs
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "RollingFeaturizer":
        """Mark the featurizer as fitted; nothing is learned from the data.

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Compute the configured rolling statistics for ``y``.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column followed by the
            rolling feature columns, indexed like ``y``.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            column = y.iloc[:, 0]
        elif isinstance(y, pd.Series):
            column = y
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        data = np.asarray(column, dtype=np.float64)
        columns = {"value": data}

        # Imported here (not at module level), as in the original code.
        from timesmith.utils.rolling import rolling_statistics

        n_combinations = len(self.windows) * len(self.functions)
        if n_combinations > 4:
            # Many combinations: delegate the whole batch to rolling_statistics.
            # NOTE(review): this path does not pass min_periods, unlike the
            # direct calls below -- confirm both paths agree on warm-up rows.
            computed = rolling_statistics(
                data, self.windows, self.functions, n_jobs=self.n_jobs
            )
            for name, stat in computed.items():
                # NaN standard deviations are reported as 0.0.
                columns[name] = (
                    np.nan_to_num(stat, nan=0.0)
                    if name.startswith("rolling_std_")
                    else stat
                )
            return pd.DataFrame(columns, index=y.index)

        # Few combinations: call the individual rolling helpers directly.
        dispatch = {
            "mean": rolling_mean,
            "std": rolling_std,
            "min": rolling_min,
            "max": rolling_max,
            "median": rolling_median,
        }
        for window in self.windows:
            for func in self.functions:
                compute = dispatch.get(func)
                if compute is None:
                    logger.warning(f"Unknown function {func}, skipping")
                    continue
                stat = compute(data, window, min_periods=1)
                if func == "std":
                    # NaN standard deviations are reported as 0.0.
                    stat = np.nan_to_num(stat, nan=0.0)
                columns[f"rolling_{func}_{window}"] = stat

        return pd.DataFrame(columns, index=y.index)
[docs]
class TimeFeaturizer(BaseFeaturizer):
    """Create time-based features from datetime index.

    Transforms SeriesLike to TableLike by extracting calendar features
    (year, month, day, ...) from the index of the input.
    """

    def __init__(self):
        """Initialize time featurizer."""
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=False,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "TimeFeaturizer":
        """Fit the featurizer (no-op for time features).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create time features.

        Output columns: ``value``, ``year``, ``month``, ``day``,
        ``dayofweek``, ``dayofyear``, ``week`` (ISO week), ``quarter``,
        ``is_weekend``, ``is_month_start`` and ``is_month_end``.

        Args:
            y: SeriesLike data with datetime index (or an index that
                ``ensure_datetime_index`` can convert).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with time features, indexed by the
            datetime index.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        # Only the index is handed to the helper, wrapped in a placeholder
        # Series; the explicit dtype avoids pandas' empty-Series dtype warning.
        index = ensure_datetime_index(
            pd.Series(index=series.index, dtype="float64")
        ).index

        if isinstance(series.index, pd.DatetimeIndex):
            # Labels are already datetimes: keep label-based alignment.
            values = series
        else:
            # The labels were converted, so aligning on the old labels would
            # produce NaN values; re-label the data by position instead.
            # NOTE(review): assumes ensure_datetime_index preserves row
            # order -- confirm against the helper.
            values = series.set_axis(index)

        df = pd.DataFrame({"value": values}, index=index)
        df["year"] = index.year
        df["month"] = index.month
        df["day"] = index.day
        df["dayofweek"] = index.dayofweek
        df["dayofyear"] = index.dayofyear
        df["week"] = index.isocalendar().week
        df["quarter"] = index.quarter
        # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday.
        df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
        df["is_month_start"] = index.is_month_start.astype(int)
        df["is_month_end"] = index.is_month_end.astype(int)
        return df
[docs]
class DifferencingFeaturizer(BaseFeaturizer):
    """Build differenced versions of a time series.

    Transforms SeriesLike to TableLike: the output holds the original
    values plus one ``diff_{order}`` column per requested order.
    """

    def __init__(self, orders: List[int] = [1]):
        """Initialize differencing featurizer.

        Args:
            orders: List of differencing orders (e.g., [1] for first difference).
        """
        self.orders = orders
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "DifferencingFeaturizer":
        """Mark the featurizer as fitted; nothing is learned from the data.

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Compute the configured differences of ``y``.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and one
            ``diff_{order}`` column per configured order.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            source = y.iloc[:, 0]
        elif isinstance(y, pd.Series):
            source = y
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        def difference(data: pd.Series, times: int) -> pd.Series:
            # Apply the first difference `times` times in a row; zero or a
            # negative count leaves the data untouched.
            for _step in range(times):
                data = data.diff()
            return data

        features = pd.DataFrame({"value": source})
        for order in self.orders:
            features[f"diff_{order}"] = difference(source, order)
        return features
[docs]
class SeasonalFeaturizer(BaseFeaturizer):
    """Encode seasonality as sine/cosine pairs.

    Transforms SeriesLike to TableLike: the output holds the original
    values plus a sine and a cosine column per seasonal period, evaluated
    on the row position (0, 1, 2, ...) rather than on the index labels.
    """

    def __init__(self, seasonal_periods: List[int]):
        """Initialize seasonal featurizer.

        Args:
            seasonal_periods: List of seasonal periods
                (e.g., [12, 365] for monthly/yearly).
        """
        self.seasonal_periods = seasonal_periods
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "SeasonalFeaturizer":
        """Mark the featurizer as fitted; nothing is learned from the data.

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Compute the sine/cosine seasonal features for ``y``.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column plus
            ``seasonal_sin_{period}`` and ``seasonal_cos_{period}`` columns
            for every configured period.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()

        if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            source = y.iloc[:, 0]
        elif isinstance(y, pd.Series):
            source = y
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        features = pd.DataFrame({"value": source})
        # 2*pi*t for t = 0..n-1; dividing by the period gives the phase angle.
        scaled_steps = 2 * np.pi * np.arange(len(source))
        for period in self.seasonal_periods:
            phase = scaled_steps / period
            features[f"seasonal_sin_{period}"] = np.sin(phase)
            features[f"seasonal_cos_{period}"] = np.cos(phase)
        return features
[docs]
class DegradationRateFeaturizer(BaseFeaturizer):
    """Create degradation rate features (rate of change) from time series.

    Transforms SeriesLike to TableLike by creating percentage change
    features over one or more look-back periods.
    """

    def __init__(self, periods: List[int] = [1, 3, 5]):
        """Initialize degradation rate featurizer.

        Args:
            periods: List of periods for rate of change calculation.
        """
        self.periods = periods
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "DegradationRateFeaturizer":
        """Fit the featurizer (no-op for degradation rates).

        Args:
            y: Target time series (not inspected).
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create degradation rate features.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and one
            ``degradation_rate_{period}`` column per configured period.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        df = pd.DataFrame({"value": series})
        for period in self.periods:
            # Percentage change relative to the value `period` rows earlier.
            # fill_method=None disables pandas' implicit forward-fill of
            # missing values (a deprecated default), so gaps stay NaN instead
            # of producing artificial zero rates, and results are the same
            # across pandas versions.
            df[f"degradation_rate_{period}"] = series.pct_change(
                periods=period, fill_method=None
            )
        return df