# Source code for ml_wrappers.model.text_model_wrapper

# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""Defines wrappers for text-based models."""

import numpy as np
from ml_wrappers.common.constants import ModelTask
from ml_wrappers.common.warnings_suppressor import shap_warnings_suppressor
from ml_wrappers.model.model_utils import (MULTILABEL_THRESHOLD,
                                           _is_transformers_pipeline)

with shap_warnings_suppressor():
    try:
        from shap import models
        shap_installed = True
    except BaseException:
        shap_installed = False


def _wrap_text_model(model, examples, model_task, is_function):
    """Wrap a text model in a common prediction API when required.

    Only models recognised by ``_is_transformers_pipeline`` are wrapped,
    and only for the text classification, question answering and
    multilabel text classification tasks.  Every other model/task
    combination is returned untouched.

    :param model: The model or function to evaluate on the examples.
    :type model: function or model with a predict or predict_proba function
    :param examples: The model evaluation examples.
        Not referenced by this function.
    :type examples: ml_wrappers.DatasetWrapper or numpy.ndarray
        or pandas.DataFrame or pandas.Series or scipy.sparse.csr_matrix
        or shap.DenseData or torch.Tensor
    :param model_task: The text task the model solves; it is compared
        against the ``ModelTask`` constants to select a wrapper.
    :type model_task: str
    :param is_function: Presumably whether ``model`` is a plain function;
        not referenced by this function.
    :return: The wrapped model (or the original model when no wrapper
        applies) together with the unchanged model task.
    :rtype: (function, str) or (model, str)
    """
    # Anything that is not a Transformers pipeline is passed through.
    if not _is_transformers_pipeline(model):
        return model, model_task

    if model_task == ModelTask.TEXT_CLASSIFICATION:
        wrapped = WrappedTextClassificationModel(model)
    elif model_task == ModelTask.QUESTION_ANSWERING:
        wrapped = WrappedQuestionAnsweringModel(model)
    elif model_task == ModelTask.MULTILABEL_TEXT_CLASSIFICATION:
        wrapped = WrappedTextClassificationModel(model, multilabel=True)
    else:
        # Unsupported task for a pipeline: keep the model as provided.
        wrapped = model
    return wrapped, model_task


class WrappedTextClassificationModel(object):
    """A class for wrapping a Transformers model in the scikit-learn style."""

    def __init__(self, model, multilabel=False):
        """Initialize the WrappedTextClassificationModel.

        :param model: The Transformers pipeline to wrap.
        :param multilabel: When True, ``predict`` returns one 0/1
            indicator row per example instead of a single class index.
        :type multilabel: bool
        :raises ImportError: If SHAP could not be imported.
        """
        if not shap_installed:
            raise ImportError("SHAP is not installed. Please install it "
                              "to use WrappedTextClassificationModel.")
        self._model = model
        self._wrapped_model = models.TransformersPipeline(model)
        self._multilabel = multilabel

    def predict(self, dataset):
        """Predict the output using the wrapped Transformers model.

        :param dataset: The dataset to predict on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: For single-label models, the position of the highest
            score within each example's score list.  For multilabel
            models, one indicator row per example with 1 where the score
            exceeds ``MULTILABEL_THRESHOLD`` and 0 elsewhere.
        :rtype: numpy.ndarray
        """
        pipeline_dicts = self._wrapped_model.inner_model(dataset)
        output = []
        for val in pipeline_dicts:
            # The pipeline may return a single dict per example rather
            # than a list of dicts; normalise to a list.
            if not isinstance(val, list):
                val = [val]
            scores = np.array([obj["score"] for obj in val])
            if self._multilabel:
                # Indicator row of predicted labels, since numpy does not
                # support jagged arrays; this seems to be the format
                # scikit-learn MultiOutputClassifier uses,
                # see sklearn.multioutput.MultiOutputClassifier.predict
                output.append((scores > MULTILABEL_THRESHOLD).astype(float))
            else:
                # NOTE(review): when the pipeline yields only its top
                # label (a single dict), there is one score and the
                # result is always index 0 - confirm the pipeline is
                # configured to return all class scores.
                output.append(np.argmax(scores))
        return np.array(output)

    def predict_proba(self, dataset):
        """Predict the output probability using the Transformers model.

        Delegates directly to the SHAP ``TransformersPipeline`` wrapper.

        :param dataset: The dataset to predict_proba on.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: Whatever the wrapped SHAP model returns for ``dataset``.
        """
        return self._wrapped_model(dataset)
class WrappedQuestionAnsweringModel(object):
    """A class for wrapping a Transformers model in the scikit-learn style."""

    def __init__(self, model):
        """Initialize the WrappedQuestionAnsweringModel.

        :param model: The callable question answering model to wrap; it
            is invoked with a ``{'context': ..., 'question': ...}`` dict.
        """
        self._model = model

    def predict(self, dataset):
        """Predict the output using the wrapped Transformers model.

        The model is called once per (context, question) pair and the
        ``'answer'`` entry of each result is collected.

        :param dataset: The dataset to predict on.  It is indexed with
            the keys ``'context'`` and ``'questions'``.
        :type dataset: ml_wrappers.DatasetWrapper
        :return: The answers, in the order of the input pairs.
        :rtype: numpy.ndarray
        """
        pairs = zip(dataset['context'], dataset['questions'])
        answers = [
            self._model({'context': context, 'question': question})['answer']
            for context, question in pairs
        ]
        return np.array(answers)