Source code for classifier
import logging
import sys

import torch
from packaging import version
from torch import nn

logger = logging.getLogger(__name__)

try:
    from transformers.activations import get_activation
except ImportError:
    logger.warning(
        "Could not import `get_activation` from `transformers.activations`. Only GELU will be "
        + "available for use in the classifier."
    )

class LinearClassifier(nn.Module):
    """``nn.Module`` to classify sentences by reducing the hidden dimension to 1.

    Arguments:
        web_hidden_size (int): The output hidden size from the word embedding model. Used as
            the input to the first linear layer in this ``nn.Module``.
        linear_hidden (int, optional): The number of hidden units in this classifier.
            Default is 1536.
        dropout (float, optional): The value for dropout applied before the 2nd linear layer.
            Default is 0.1.
        activation_string (str, optional): A string representing an activation function
            in ``get_activation()``. Default is "gelu".
    """

    def __init__(
        self,
        web_hidden_size,
        linear_hidden=1536,
        dropout=0.1,
        activation_string="gelu",
    ):
        super(LinearClassifier, self).__init__()
        self.dropout1 = nn.Dropout(dropout) if dropout else nn.Identity()
        self.linear1 = nn.Linear(web_hidden_size, linear_hidden)
        self.linear2 = nn.Linear(linear_hidden, 1)
        # self.sigmoid = nn.Sigmoid()

        # support older versions of huggingface/transformers
        if activation_string == "gelu":
            self.activation = nn.GELU()
        else:
            self.activation = (
                get_activation(activation_string)
                if activation_string
                else nn.Identity()
            )

    def forward(self, x, mask):
        """
        Forward function. ``x`` is the input ``sent_vector`` tensor and ``mask`` avoids computations
        on padded values. Returns ``sent_scores``.
        """
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout1(x)
        x = self.linear2(x)
        # x = self.sigmoid(x)
        sent_scores = x.squeeze(-1) * mask.float()
        sent_scores[sent_scores == 0] = -9e3
        return sent_scores
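
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how `LinearClassifier` might be called, assuming a
# 768-dimensional sentence-vector size and a batch of 2 documents with up to
# 4 sentences each. The sizes and the helper name below are illustrative
# assumptions only.
def _example_linear_classifier():
    classifier = LinearClassifier(web_hidden_size=768)
    sent_vectors = torch.rand(2, 4, 768)  # (batch size, num sentences, hidden size)
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real sentence, 0 = padding
    sent_scores = classifier(sent_vectors, mask)
    # `sent_scores` has shape (batch size, num sentences); padded positions are -9e3
    return sent_scores
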
class SimpleLinearClassifier(nn.Module):
    """``nn.Module`` to classify sentences by reducing the hidden dimension to 1. This module
    contains a single linear layer.

    Arguments:
        web_hidden_size (int): The output hidden size from the word embedding model. Used as
            the input to the linear layer in this ``nn.Module``.
    """

    def __init__(self, web_hidden_size):
        super(SimpleLinearClassifier, self).__init__()
        self.linear = nn.Linear(web_hidden_size, 1)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, x, mask):
        """
        Forward function. ``x`` is the input ``sent_vector`` tensor and ``mask`` avoids computations
        on padded values. Returns ``sent_scores``.
        """
        x = self.linear(x).squeeze(-1)
        # x = self.sigmoid(x)
        sent_scores = x * mask.float()
        sent_scores[sent_scores == 0] = -9e3
        return sent_scores
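
# --- Illustrative usage sketch (not part of the original module) ---
# The same call pattern applies to `SimpleLinearClassifier`; only the hidden
# size is needed. The sizes and the helper name below are illustrative
# assumptions only.
def _example_simple_linear_classifier():
    classifier = SimpleLinearClassifier(web_hidden_size=768)
    sent_vectors = torch.rand(2, 4, 768)  # (batch size, num sentences, hidden size)
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real sentence, 0 = padding
    return classifier(sent_vectors, mask)  # shape (batch size, num sentences)
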
class TransformerEncoderClassifier(nn.Module):
    r"""
    ``nn.Module`` to classify sentences by running the sentence vectors through some
    ``nn.TransformerEncoder`` layers and then reducing the hidden dimension to 1 with a
    linear layer.

    Arguments:
        d_model (int): The number of expected features in the input.
        nhead (int, optional): The number of heads in the multi-head attention models.
            Default is 8.
        dim_feedforward (int, optional): The dimension of the feedforward network model.
            Default is 2048.
        dropout (float, optional): The dropout value. Default is 0.1.
        num_layers (int, optional): The number of ``TransformerEncoderLayer``\ s. Default is 2.
        custom_reduction (nn.Module, optional): An ``nn.Module`` that maps ``d_model`` inputs
            to 1 value and is called as ``reduction(x, mask)``; if not specified, a single
            ``nn.Linear(d_model, 1)`` layer is created. Default is ``None``.
    """

    def __init__(
        self,
        d_model,
        nhead=8,
        dim_feedforward=2048,
        dropout=0.1,
        num_layers=2,
        custom_reduction=None,
    ):
        super(TransformerEncoderClassifier, self).__init__()
        if version.parse(torch.__version__) < version.parse("1.5.0"):
            logger.error(
                "You have PyTorch version %s installed, but `TransformerEncoderClassifier` "
                + "requires at least version 1.5.0.",
                torch.__version__,
            )
            sys.exit(1)

        self.nhead = nhead
        self.custom_reduction = custom_reduction

        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, dim_feedforward=dim_feedforward, dropout=dropout
        )
        layer_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers, norm=layer_norm)

        if custom_reduction:
            self.reduction = custom_reduction
        else:
            linear = nn.Linear(d_model, 1)
            # sigmoid = nn.Sigmoid()
            self.reduction = linear  # nn.Sequential(linear, sigmoid)

    def forward(self, x, mask):
        """
        Forward function. ``x`` is the input ``sent_vector`` tensor and ``mask`` avoids computations
        on padded values. Returns ``sent_scores``.
        """
        # add dimension in the middle
        attn_mask = mask.unsqueeze(1)
        # expand the middle dimension to the same size as the last dimension (the number of
        # sentences/source length)
        # Example with batch size 2: There are two masks since there are two sequences in the
        # batch. Each mask is a list of booleans for each sentence vector. The below line expands
        # each of these lists by duplicating them until they are each as long as the number of
        # sentences. Now instead of a list of booleans, each mask is a matrix where each row is
        # identical. This effectively masks tokens where the entire column is False.
        # Slight Explanation (for 2D not 3D): https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3 # noqa: E501
        # Detailed Explanation for Beginners: https://github.com/bentrevett/pytorch-seq2seq/blob/master/4%20-%20Packed%20Padded%20Sequences%2C%20Masking%2C%20Inference%20and%20BLEU.ipynb # noqa: E501
        # PyTorch MultiheadAttention Docs: https://pytorch.org/docs/stable/nn.html#torch.nn.MultiheadAttention.forward # noqa: E501
        attn_mask = attn_mask.expand(-1, attn_mask.size(2), -1)
        # repeat the mask for each attention head
        attn_mask = attn_mask.repeat(self.nhead, 1, 1)
        # attn_mask is shape (batch size*num_heads, target sequence length, source sequence length)
        # set all the 0's (False) to negative infinity and the 1's (True) to 0.0 because the
        # attn_mask is additive
        attn_mask = (
            attn_mask.float()
            .masked_fill(attn_mask == 0, float("-inf"))
            .masked_fill(attn_mask == 1, float(0.0))
        )

        x = x.transpose(0, 1)
        # x is shape (source sequence length, batch size, feature number)
        x = self.encoder(x, mask=attn_mask)
        # x is still shape (source sequence length, batch size, feature number)
        x = x.transpose(0, 1).squeeze()
        # x is shape (batch size, source sequence length, feature number)

        if self.custom_reduction:
            x = self.reduction(x, mask)
        else:
            x = self.reduction(x)
        # x is shape (batch size, source sequence length, 1)
        # mask is shape (batch size, source sequence length)
        sent_scores = x.squeeze(-1) * mask.float()
        sent_scores[sent_scores == 0] = -9e3
        return sent_scores
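
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how `TransformerEncoderClassifier` might be called. The
# sizes and the helper name below are illustrative assumptions only; note that
# `d_model` must be divisible by `nhead` for the multi-head attention layers
# to initialize.
def _example_transformer_encoder_classifier():
    classifier = TransformerEncoderClassifier(d_model=768, nhead=8, num_layers=2)
    sent_vectors = torch.rand(2, 4, 768)  # (batch size, num sentences, d_model)
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real sentence, 0 = padding
    sent_scores = classifier(sent_vectors, mask)
    # `sent_scores` has shape (batch size, num sentences); padded positions are -9e3
    return sent_scores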