Source code for ml_wrappers.dataset.timestamp_featurizer
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
"""Defines a custom timestamp featurizer for converting timestamp columns to numeric."""
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, TransformerMixin
class CustomTimestampFeaturizer(BaseEstimator, TransformerMixin):
    """An estimator for featurizing timestamp columns to numeric data.

    :param features: Optional feature column names.
    :type features: list[str]
    :param return_pandas: Whether to return the transformed dataset as a pandas DataFrame.
    :type return_pandas: bool
    :param modify_in_place: Whether to modify the original dataset in place.
    :type modify_in_place: bool
    """

    # Datetime components extracted into separate columns, in output order.
    _TIME_PARTS = ('year', 'month', 'day', 'hour', 'minute', 'second')

    def __init__(self, features=None, return_pandas=False, modify_in_place=False):
        """Initialize the CustomTimestampFeaturizer.

        :param features: Optional feature column names.
        :type features: list[str]
        :param return_pandas: Whether to return the transformed dataset as a pandas DataFrame.
        :type return_pandas: bool
        :param modify_in_place: Whether to modify the original dataset in place.
        :type modify_in_place: bool
        """
        self.features = features
        self.return_pandas = return_pandas
        self.modify_in_place = modify_in_place
        # Labels of the timestamp columns detected by fit. While this is empty,
        # transform returns its input unchanged.
        self._time_col_names = []
        # Minimum epoch timestamp of each detected column, aligned with
        # _time_col_names.
        self._min = []

    def fit(self, X, y=None):
        """Fits the CustomTimestampFeaturizer.

        Detects the datetime columns of the dataset and records the minimum
        timestamp of each one.

        :param X: The dataset containing timestamp columns to featurize.
        :type X: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        :param y: Optional target values (None for unsupervised transformations).
        :type y: numpy.ndarray or None
        :return: The fitted featurizer.
        :rtype: CustomTimestampFeaturizer
        """
        # Drop any state left over from a previous fit so that the early return
        # below cannot leave stale timestamp columns behind.
        self._time_col_names = []
        self._min = []
        # If the data was previously successfully summarized, then there are no
        # timestamp columns as it must be numeric.
        # Also, if the dataset is sparse, we can assume there are no timestamps
        if str(type(X)).endswith(".DenseData'>") or issparse(X):
            return self
        tmp_dataset = X
        # If numpy array, temporarily convert to pandas for easier and uniform timestamp handling
        if isinstance(X, np.ndarray):
            tmp_dataset = pd.DataFrame(X, columns=self.features)
        self._time_col_names = [
            column for column in tmp_dataset.columns
            if is_datetime(tmp_dataset[column])
        ]
        # Calculate the min date for each column
        self._min = [
            tmp_dataset[time_col_name].map(lambda x: x.timestamp()).min()
            for time_col_name in self._time_col_names
        ]
        return self

    def transform(self, X):
        """Transforms the timestamp columns to numeric type in the given dataset.

        Specifically, extracts the year, month, day, hour, minute, second and time
        since min timestamp in the training dataset. When fit found no timestamp
        columns, the dataset is returned unchanged.

        :param X: The dataset containing timestamp columns to featurize.
        :type X: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        :return: The transformed dataset.
        :rtype: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        """
        if not self._time_col_names:
            return X
        # Temporarily convert to pandas for easier and uniform timestamp handling
        if isinstance(X, np.ndarray):
            tmp_dataset = pd.DataFrame(X, columns=self.features)
        elif self.modify_in_place:
            tmp_dataset = X
        else:
            # If originally pandas, make a copy to avoid changing the original dataset
            tmp_dataset = X.copy()
        for min_timestamp, time_col_name in zip(self._min, self._time_col_names):
            timestamps = tmp_dataset[time_col_name]
            # Column labels are integers when a numpy array was given without
            # feature names, so stringify before appending the suffix.
            prefix = str(time_col_name)
            # Get the year, month, day, hour, minute, second
            for part in self._TIME_PARTS:
                tmp_dataset[prefix + '_' + part] = timestamps.map(
                    lambda x, part=part: getattr(x, part))
            # Replace column itself with difference from min value, leave as same name
            # to keep index so order of other columns remains the same for other transformations
            tmp_dataset[time_col_name] = timestamps.map(
                lambda x, min_timestamp=min_timestamp: x.timestamp() - min_timestamp)
        if not self.return_pandas:
            tmp_dataset = tmp_dataset.values
        return tmp_dataset