# Source code for timesmith.core.featurizers
"""Featurizer implementations for time series feature engineering."""
import logging
from typing import Any, List, Optional
import numpy as np
import pandas as pd
from timesmith.core.base import BaseFeaturizer
from timesmith.core.tags import set_tags
from timesmith.utils.rolling import (
rolling_max,
rolling_mean,
rolling_median,
rolling_min,
rolling_std,
)
from timesmith.utils.ts_utils import ensure_datetime_index
logger = logging.getLogger(__name__)
[docs]
class LagFeaturizer(BaseFeaturizer):
    """Create lagged features from a time series.

    Transforms SeriesLike input into a TableLike frame holding the original
    values (cast to float64) plus one column per requested lag. Lag
    differences, percentage changes and seasonal lags can be added as well.
    """

    def __init__(
        self,
        # NOTE: the list default is kept for interface compatibility; it is
        # never stored by reference or mutated (see the copies below).
        lags: List[int] = [1, 2, 3, 7, 14],
        include_diff: bool = False,
        include_pct_change: bool = False,
        seasonal_lags: Optional[List[int]] = None,
        prevent_leads: bool = True,
    ):
        """Initialize lag featurizer.

        Args:
            lags: Lag periods to create. Positive values look into the past,
                negative values are leads (future values).
            include_diff: If True, add ``diff_{lag}`` columns
                (value minus lagged value) for every positive lag.
            include_pct_change: If True, add ``pct_change_{lag}`` columns
                for every positive lag.
            seasonal_lags: Optional seasonal lag periods
                (e.g., [12, 24] for monthly data).
            prevent_leads: If True, non-positive lags are dropped so that no
                future data can leak into the features.
        """
        if prevent_leads:
            self.lags = [lag for lag in lags if lag > 0]
            if len(self.lags) < len(lags):
                logger.warning(
                    "Filtered out non-positive lags. Using: %s", self.lags
                )
        else:
            # Copy so the instance never aliases the shared default list
            # (or a list the caller may mutate later).
            self.lags = list(lags)
        self.include_diff = include_diff
        self.include_pct_change = include_pct_change
        # ``is not None`` instead of truthiness so array-likes are accepted.
        self.seasonal_lags = list(seasonal_lags) if seasonal_lags is not None else []
        self.prevent_leads = prevent_leads
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "LagFeaturizer":
        """Fit the featurizer (no-op for lags).

        Args:
            y: Target time series.
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    @staticmethod
    def _shift(values: np.ndarray, periods: int) -> np.ndarray:
        """Return ``values`` shifted by ``periods`` positions, NaN-padded.

        Positive ``periods`` moves past values forward (a lag), negative
        ``periods`` pulls future values backward (a lead), zero is a copy.
        Shifts longer than the array yield an all-NaN result.
        """
        shifted = np.full(len(values), np.nan, dtype=np.float64)
        if periods > 0:
            shifted[periods:] = values[:-periods]
        elif periods < 0:
            shifted[:periods] = values[-periods:]
        else:
            shifted[:] = values
        return shifted

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create lag features using vectorized NumPy operations.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame indexed like ``y`` with columns ``value``,
            ``lag_{k}``, and optionally ``diff_{k}``, ``pct_change_{k}`` and
            ``seasonal_lag_{k}``.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        values = np.asarray(series, dtype=np.float64)
        n = len(values)
        feature_dict = {"value": values}

        # Standard lags. Negative entries can only be present when
        # prevent_leads is False and are emitted as leads.
        for lag in self.lags:
            feature_dict[f"lag_{lag}"] = self._shift(values, lag)

        # Differences and percentage changes are defined for true lags only.
        positive_lags = [lag for lag in self.lags if lag > 0]

        if self.include_diff:
            for lag in positive_lags:
                # The NaN padding of the shifted array propagates, leaving
                # the first ``lag`` rows undefined.
                feature_dict[f"diff_{lag}"] = values - self._shift(values, lag)

        if self.include_pct_change:
            for lag in positive_lags:
                previous = self._shift(values, lag)
                pct_values = np.full(n, np.nan, dtype=np.float64)
                # Skip zero denominators; those rows stay NaN.
                usable = previous != 0
                pct_values[usable] = (
                    values[usable] - previous[usable]
                ) / previous[usable]
                feature_dict[f"pct_change_{lag}"] = pct_values

        # Seasonal lags: non-positive periods are only emitted when leads
        # are explicitly allowed.
        for seasonal_lag in self.seasonal_lags:
            if seasonal_lag > 0 or not self.prevent_leads:
                feature_dict[f"seasonal_lag_{seasonal_lag}"] = self._shift(
                    values, seasonal_lag
                )

        # Build the frame in one go (faster than column-by-column inserts).
        return pd.DataFrame(feature_dict, index=series.index)
[docs]
class RollingFeaturizer(BaseFeaturizer):
    """Create rolling window features from a time series.

    Transforms SeriesLike input into a TableLike frame holding the original
    values (cast to float64) plus one ``rolling_{func}_{window}`` column per
    requested statistic and window.
    """

    def __init__(
        self,
        # NOTE: list defaults are kept for interface compatibility; they are
        # copied below so instances never share them.
        windows: List[int] = [7, 14, 30],
        functions: List[str] = ["mean", "std"],
        n_jobs: Optional[int] = None,
    ):
        """Initialize rolling featurizer.

        Args:
            windows: Window sizes.
            functions: Names of the statistics to compute
                ('mean', 'std', 'min', 'max', 'median').
            n_jobs: Number of parallel jobs, forwarded to
                ``rolling_statistics`` when the batched path is used.
        """
        # Defensive copies: storing the default lists by reference would let
        # a mutation on one instance leak into every later instance.
        self.windows = list(windows)
        self.functions = list(functions)
        self.n_jobs = n_jobs
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "RollingFeaturizer":
        """Fit the featurizer (no-op for rolling features).

        Args:
            y: Target time series.
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create rolling features.

        When more than four (window, function) combinations are requested
        the work is delegated to ``rolling_statistics``; otherwise the
        individual rolling helpers are called directly with
        ``min_periods=1``.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame indexed like ``y`` with a ``value`` column
            and the rolling feature columns.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        values = np.asarray(series, dtype=np.float64)
        feature_dict = {"value": values}

        if len(self.windows) * len(self.functions) > 4:
            # Imported lazily and only on the path that needs it.
            from timesmith.utils.rolling import rolling_statistics

            rolling_results = rolling_statistics(
                values, self.windows, self.functions, n_jobs=self.n_jobs
            )
            for key, result in rolling_results.items():
                # Undefined std values are reported as 0.
                # NOTE(review): np.nan_to_num also clamps +/-inf to the
                # largest finite floats - confirm that is intended.
                if key.startswith("rolling_std_"):
                    result = np.nan_to_num(result, nan=0.0)
                feature_dict[key] = result
        else:
            # Few operations: direct calls avoid the batching overhead.
            function_map = {
                "mean": rolling_mean,
                "std": rolling_std,
                "min": rolling_min,
                "max": rolling_max,
                "median": rolling_median,
            }
            for window in self.windows:
                for func in self.functions:
                    rolling_func = function_map.get(func)
                    if rolling_func is None:
                        logger.warning(f"Unknown function {func}, skipping")
                        continue
                    result = rolling_func(values, window, min_periods=1)
                    # Undefined std values are reported as 0.
                    if func == "std":
                        result = np.nan_to_num(result, nan=0.0)
                    feature_dict[f"rolling_{func}_{window}"] = result

        # Build the frame in one go (faster than column-by-column inserts).
        return pd.DataFrame(feature_dict, index=series.index)
[docs]
class TimeFeaturizer(BaseFeaturizer):
    """Create calendar features from a datetime index.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus columns derived from the index (year, month, day, ...).
    """

    def __init__(self):
        """Initialize time featurizer."""
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=False,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "TimeFeaturizer":
        """Fit the featurizer (no-op for time features).

        Args:
            y: Target time series.
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create time features.

        Args:
            y: SeriesLike data (Series or single-column DataFrame) whose
                index is, or can be turned into, a datetime index by
                ``ensure_datetime_index``.
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with columns ``value``, ``year``, ``month``,
            ``day``, ``dayofweek``, ``dayofyear``, ``week``, ``quarter``,
            ``is_weekend``, ``is_month_start`` and ``is_month_end``.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        # Only the index matters here. The explicit dtype avoids the
        # "default dtype for empty Series" DeprecationWarning on pandas 1.x.
        placeholder = pd.Series(index=series.index, dtype="float64")
        index = ensure_datetime_index(placeholder).index

        if len(index) == len(series):
            # Re-label positionally: if the helper returned a converted
            # index, label alignment against it would turn every value
            # into NaN.
            # NOTE(review): assumes ensure_datetime_index keeps row order -
            # confirm against the helper.
            value_column = series.set_axis(index)
        else:
            # Length changed, so positional mapping is impossible; fall
            # back to label alignment.
            value_column = series
        df = pd.DataFrame({"value": value_column}, index=index)

        df["year"] = index.year
        df["month"] = index.month
        df["day"] = index.day
        df["dayofweek"] = index.dayofweek
        df["dayofyear"] = index.dayofyear
        df["week"] = index.isocalendar().week
        df["quarter"] = index.quarter
        # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
        df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
        df["is_month_start"] = index.is_month_start.astype(int)
        df["is_month_end"] = index.is_month_end.astype(int)
        return df
[docs]
class DifferencingFeaturizer(BaseFeaturizer):
    """Create differenced features from a time series.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus one ``diff_{order}`` column per requested differencing order.
    """

    def __init__(
        self,
        # NOTE: the list default is kept for interface compatibility; it is
        # copied below so instances never share it.
        orders: List[int] = [1],
    ):
        """Initialize differencing featurizer.

        Args:
            orders: Differencing orders (e.g., [1] for the first difference,
                [1, 2] for first and second differences).
        """
        # Defensive copy: storing the default list by reference would let a
        # mutation on one instance leak into every later instance.
        self.orders = list(orders)
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "DifferencingFeaturizer":
        """Fit the featurizer (no-op for differencing).

        Args:
            y: Target time series.
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create differenced features.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and one
            ``diff_{order}`` column per configured order.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        df = pd.DataFrame({"value": series})
        for order in self.orders:
            # An order-k difference is the first difference applied k times;
            # orders <= 0 therefore leave the series untouched.
            differenced = series
            for _ in range(order):
                differenced = differenced.diff()
            df[f"diff_{order}"] = differenced
        return df
[docs]
class SeasonalFeaturizer(BaseFeaturizer):
    """Encode seasonality as sine/cosine pairs.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus, for every configured period, a ``seasonal_sin_{period}``
    and a ``seasonal_cos_{period}`` column evaluated at the row position.
    """

    def __init__(self, seasonal_periods: List[int]):
        """Initialize seasonal featurizer.

        Args:
            seasonal_periods: List of seasonal periods
                (e.g., [12, 365] for monthly/yearly).
        """
        self.seasonal_periods = seasonal_periods
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "SeasonalFeaturizer":
        """Mark the featurizer as fitted; nothing is learned from the data.

        Args:
            y: Target time series.
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create seasonal features.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and a sine/cosine
            column pair per seasonal period.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        elif isinstance(y, pd.Series):
            series = y
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        df = pd.DataFrame({"value": series})
        # 2*pi*t for t = 0..n-1 does not depend on the period, so it is
        # computed once and only divided inside the loop.
        phase = 2 * np.pi * np.arange(len(series))
        for period in self.seasonal_periods:
            angle = phase / period
            df[f"seasonal_sin_{period}"] = np.sin(angle)
            df[f"seasonal_cos_{period}"] = np.cos(angle)
        return df
[docs]
class DegradationRateFeaturizer(BaseFeaturizer):
    """Create degradation rate (rate of change) features from a time series.

    Transforms SeriesLike input into a TableLike frame holding the original
    values plus one ``degradation_rate_{period}`` percentage-change column
    per configured period.
    """

    def __init__(
        self,
        # NOTE: the list default is kept for interface compatibility; it is
        # copied below so instances never share it.
        periods: List[int] = [1, 3, 5],
    ):
        """Initialize degradation rate featurizer.

        Args:
            periods: Periods over which the rate of change is calculated.
        """
        # Defensive copy: storing the default list by reference would let a
        # mutation on one instance leak into every later instance.
        self.periods = list(periods)
        set_tags(
            self,
            scitype_input="SeriesLike",
            scitype_output="TableLike",
            handles_missing=False,
            requires_sorted_index=True,
        )

    def fit(
        self, y: Any, X: Optional[Any] = None, **fit_params: Any
    ) -> "DegradationRateFeaturizer":
        """Fit the featurizer (no-op for degradation rates).

        Args:
            y: Target time series.
            X: Optional exogenous data (ignored).
            **fit_params: Additional fit parameters (ignored).

        Returns:
            Self for method chaining.
        """
        self._is_fitted = True
        return self

    def transform(self, y: Any, X: Optional[Any] = None) -> pd.DataFrame:
        """Create degradation rate features.

        Args:
            y: SeriesLike data (Series or single-column DataFrame).
            X: Optional exogenous data (ignored).

        Returns:
            TableLike DataFrame with a ``value`` column and one
            ``degradation_rate_{period}`` column per configured period.

        Raises:
            ValueError: If ``y`` is neither a Series nor a single-column
                DataFrame.
        """
        self._check_is_fitted()
        if isinstance(y, pd.Series):
            series = y
        elif isinstance(y, pd.DataFrame) and y.shape[1] == 1:
            series = y.iloc[:, 0]
        else:
            raise ValueError("y must be SeriesLike (Series or single-column DataFrame)")

        df = pd.DataFrame({"value": series})
        for period in self.periods:
            # Rate of change relative to the value ``period`` rows earlier.
            df[f"degradation_rate_{period}"] = series.pct_change(periods=period)
        return df