# Source code for ml_wrappers.dataset.timestamp_featurizer

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines a custom timestamp featurizer for converting timestamp columns to numeric."""

import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, TransformerMixin


class CustomTimestampFeaturizer(BaseEstimator, TransformerMixin):
    """An estimator for featurizing timestamp columns to numeric data.

    For every datetime column detected during ``fit``, ``transform`` appends
    ``<column>_year``, ``<column>_month``, ``<column>_day``, ``<column>_hour``,
    ``<column>_minute`` and ``<column>_second`` columns, and replaces the
    original column with the difference (``Timestamp.timestamp()`` units,
    i.e. seconds) from the minimum timestamp seen for that column in ``fit``.

    :param features: Optional feature column names.
    :type features: list[str]
    :param return_pandas: Whether to return the transformed dataset as a
        pandas DataFrame.
    :type return_pandas: bool
    :param modify_in_place: Whether to modify the original dataset in place.
    :type modify_in_place: bool
    """

    def __init__(self, features=None, return_pandas=False, modify_in_place=False):
        """Initialize the CustomTimestampFeaturizer.

        :param features: Optional feature column names.
        :type features: list[str]
        :param return_pandas: Whether to return the transformed dataset as a
            pandas DataFrame.
        :type return_pandas: bool
        :param modify_in_place: Whether to modify the original dataset in place.
        :type modify_in_place: bool
        """
        self.features = features
        self.return_pandas = return_pandas
        self.modify_in_place = modify_in_place
        # Names of the datetime columns found by fit; empty until fit is
        # called, which makes transform a pass-through on an unfitted instance.
        self._time_col_names = []

    def fit(self, X, y=None):
        """Fits the CustomTimestampFeaturizer.

        Records which columns of X have a datetime dtype and the minimum
        timestamp of each such column.

        :param X: The dataset containing timestamp columns to featurize.
        :type X: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        :param y: The target values.
        :type y: Optional target values (None for unsupervised transformations).
        :return: The fitted featurizer.
        :rtype: CustomTimestampFeaturizer
        """
        # Always start from a clean state so that re-fitting on data without
        # timestamp columns does not keep columns from a previous fit.
        self._time_col_names = []
        self._min = []
        # If the data was previously successfully summarized, then there are no
        # timestamp columns as it must be numeric.
        # Also, if the dataset is sparse, we can assume there are no timestamps
        if str(type(X)).endswith(".DenseData'>") or issparse(X):
            return self
        tmp_dataset = X
        # If numpy array, temporarily convert to pandas for easier and uniform
        # timestamp handling
        if isinstance(X, np.ndarray):
            tmp_dataset = pd.DataFrame(X, columns=self.features)
        self._time_col_names = [column for column in tmp_dataset.columns
                                if is_datetime(tmp_dataset[column])]
        # Calculate the min date for each column, in the same order as
        # self._time_col_names
        self._min = [tmp_dataset[column].map(lambda x: x.timestamp()).min()
                     for column in self._time_col_names]
        return self

    def transform(self, X):
        """Transforms the timestamp columns to numeric type in the given dataset.

        Specifically, extracts the year, month, day, hour, minute, second and
        time since min timestamp in the training dataset.

        :param X: The dataset containing timestamp columns to featurize.
        :type X: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        :return: The transformed dataset. X itself is returned unchanged when
            no timestamp columns were found during fit.
        :rtype: numpy.ndarray or pandas.DataFrame or scipy.sparse.csr_matrix
        """
        # Nothing to featurize: pass the input through untouched
        if not self._time_col_names:
            return X
        if isinstance(X, np.ndarray):
            # Temporarily convert to pandas for easier and uniform timestamp
            # handling
            tmp_dataset = pd.DataFrame(X, columns=self.features)
        elif not self.modify_in_place:
            # If originally pandas, make a copy to avoid changing the original
            # dataset
            tmp_dataset = X.copy()
        else:
            tmp_dataset = X
        date_parts = ('year', 'month', 'day', 'hour', 'minute', 'second')
        for time_col_name, min_timestamp in zip(self._time_col_names, self._min):
            time_column = tmp_dataset[time_col_name]
            # Column labels are integers when a numpy array is given without
            # feature names; convert to str so the suffix can be appended.
            prefix = str(time_col_name)
            for part in date_parts:
                tmp_dataset[prefix + '_' + part] = time_column.map(
                    lambda x, part=part: getattr(x, part))
            # Replace column itself with difference from min value, leave as
            # same name to keep index so order of other columns remains the
            # same for other transformations
            tmp_dataset[time_col_name] = time_column.map(
                lambda x: x.timestamp() - min_timestamp)
        if not self.return_pandas:
            tmp_dataset = tmp_dataset.values
        return tmp_dataset