Source code for classifier
import logging
import sys

import torch
from packaging import version
from torch import nn

logger = logging.getLogger(__name__)

try:
    from transformers.activations import get_activation
except ImportError:
    logger.warning(
        "Could not import `get_activation` from `transformers.activations`. Only GELU will be "
        + "available for use in the classifier."
    )

class LinearClassifier(nn.Module):
    """``nn.Module`` to classify sentences by reducing the hidden dimension to 1.

    Arguments:
        web_hidden_size (int): The output hidden size from the word embedding model. Used as
            the input to the first linear layer in this ``nn.Module``.
        linear_hidden (int, optional): The number of hidden units in this classifier.
            Default is 1536.
        dropout (float, optional): The value for dropout applied before the 2nd linear layer.
            Default is 0.1.
        activation_string (str, optional): A string representing an activation function
            in ``get_activation()``. Default is "gelu".
    """

    def __init__(
        self,
        web_hidden_size,
        linear_hidden=1536,
        dropout=0.1,
        activation_string="gelu",
    ):
        super(LinearClassifier, self).__init__()
        self.dropout1 = nn.Dropout(dropout) if dropout else nn.Identity()
        self.linear1 = nn.Linear(web_hidden_size, linear_hidden)
        self.linear2 = nn.Linear(linear_hidden, 1)
        # self.sigmoid = nn.Sigmoid()

        # support older versions of huggingface/transformers
        if activation_string == "gelu":
            self.activation = nn.GELU()
        else:
            self.activation = (
                get_activation(activation_string)
                if activation_string
                else nn.Identity()
            )

    def forward(self, x, mask):
        """
        Forward function. ``x`` is the input ``sent_vector`` tensor and ``mask`` avoids computations
        on padded values. Returns ``sent_scores``.
        """
        x = self.linear1(x)
        x = self.activation(x)
        x = self.dropout1(x)
        x = self.linear2(x)
        # x = self.sigmoid(x)
        sent_scores = x.squeeze(-1) * mask.float()
        sent_scores[sent_scores == 0] = -9e3
        return sent_scores
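
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how `LinearClassifier` might be called, assuming a
# 768-dimensional sentence-vector size and a batch of 2 documents with up to
# 4 sentences each. The sizes and the helper name below are illustrative
# assumptions only.
def _example_linear_classifier():
    classifier = LinearClassifier(web_hidden_size=768)
    sent_vectors = torch.rand(2, 4, 768)  # (batch size, num sentences, hidden size)
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real sentence, 0 = padding
    sent_scores = classifier(sent_vectors, mask)
    # `sent_scores` has shape (batch size, num sentences); padded positions are -9e3
    return sent_scores
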
class SimpleLinearClassifier(nn.Module):
    """``nn.Module`` to classify sentences by reducing the hidden dimension to 1. This module
    contains a single linear layer.

    Arguments:
        web_hidden_size (int): The output hidden size from the word embedding model. Used as
            the input to the linear layer in this ``nn.Module``.
    """

    def __init__(self, web_hidden_size):
        super(SimpleLinearClassifier, self).__init__()
        self.linear = nn.Linear(web_hidden_size, 1)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, x, mask):
        """
        Forward function. ``x`` is the input ``sent_vector`` tensor and ``mask`` avoids computations
        on padded values. Returns ``sent_scores``.
        """
        x = self.linear(x).squeeze(-1)
        # x = self.sigmoid(x)
        sent_scores = x * mask.float()
        sent_scores[sent_scores == 0] = -9e3
        return sent_scores
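
# --- Illustrative usage sketch (not part of the original module) ---
# The same call pattern applies to `SimpleLinearClassifier`; only the hidden
# size is needed. The sizes and the helper name below are illustrative
# assumptions only.
def _example_simple_linear_classifier():
    classifier = SimpleLinearClassifier(web_hidden_size=768)
    sent_vectors = torch.rand(2, 4, 768)  # (batch size, num sentences, hidden size)
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real sentence, 0 = padding
    return classifier(sent_vectors, mask)  # shape (batch size, num sentences)
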
class TransformerEncoderClassifier(nn.Module):
    r"""
    ``nn.Module`` to classify sentences by running the sentence vectors through some
    ``nn.TransformerEncoder`` layers and then reducing the hidden dimension to 1 with a
    linear layer.

    Arguments:
        d_model (int): The number of expected features in the input.
        nhead (int, optional): The number of heads in the multi-head attention models.
            Default is 8.
        dim_feedforward (int, optional): The dimension of the feedforward network model.
            Default is 2048.
        dropout (float, optional): The dropout value. Default is 0.1.
        num_layers (int, optional): The number of ``TransformerEncoderLayer``\ s. Default is 2.
        custom_reduction (nn.Module, optional): An ``nn.Module`` that maps ``d_model`` inputs
            to 1 value and is called as ``reduction(x, mask)``; if not specified, a single
            ``nn.Linear(d_model, 1)`` layer is created. Default is ``None``.
    """

    def __init__(
        self,
        d_model,
        nhead=8,
        dim_feedforward=2048,
        dropout=0.1,
        num_layers=2,
        custom_reduction=None,
    ):
        super(TransformerEncoderClassifier, self).__init__()
        if version.parse(torch.__version__) < version.parse("1.5.0"):
            logger.error(
                "You have PyTorch version %s installed, but `TransformerEncoderClassifier` "
                + "requires at least version 1.5.0.",
                torch.__version__,
            )
            sys.exit(1)

        self.nhead = nhead
        self.custom_reduction = custom_reduction

        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, dim_feedforward=dim_feedforward, dropout=dropout
        )
        layer_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers, norm=layer_norm)

        if custom_reduction:
            self.reduction = custom_reduction
        else:
            linear = nn.Linear(d_model, 1)
            # sigmoid = nn.Sigmoid()
            self.reduction = linear  # nn.Sequential(linear, sigmoid)

    def forward(self, x, mask):
        """
        Forward function. ``x`` is the input ``sent_vector`` tensor and ``mask`` avoids computations
        on padded values. Returns ``sent_scores``.
        """
        # add dimension in the middle
        attn_mask = mask.unsqueeze(1)
        # expand the middle dimension to the same size as the last dimension (the number of
        # sentences/source length)
        # Example with batch size 2: There are two masks since there are two sequences in the
        # batch. Each mask is a list of booleans for each sentence vector. The below line expands
        # each of these lists by duplicating them until they are each as long as the number of
        # sentences. Now instead of a list of booleans, each mask is a matrix where each row is
        # identical. This effectively masks tokens where the entire column is False.
        # Slight Explanation (for 2D not 3D): https://discuss.pytorch.org/t/how-to-add-padding-mask-to-nn-transformerencoder-module/63390/3 # noqa: E501
        # Detailed Explanation for Beginners: https://github.com/bentrevett/pytorch-seq2seq/blob/master/4%20-%20Packed%20Padded%20Sequences%2C%20Masking%2C%20Inference%20and%20BLEU.ipynb # noqa: E501
        # PyTorch MultiheadAttention Docs: https://pytorch.org/docs/stable/nn.html#torch.nn.MultiheadAttention.forward # noqa: E501
        attn_mask = attn_mask.expand(-1, attn_mask.size(2), -1)
        # repeat the mask for each attention head
        attn_mask = attn_mask.repeat(self.nhead, 1, 1)
        # attn_mask is shape (batch size*num_heads, target sequence length, source sequence length)
        # set all the 0's (False) to negative infinity and the 1's (True) to 0.0 because the
        # attn_mask is additive
        attn_mask = (
            attn_mask.float()
            .masked_fill(attn_mask == 0, float("-inf"))
            .masked_fill(attn_mask == 1, float(0.0))
        )

        x = x.transpose(0, 1)
        # x is shape (source sequence length, batch size, feature number)
        x = self.encoder(x, mask=attn_mask)
        # x is still shape (source sequence length, batch size, feature number)
        x = x.transpose(0, 1).squeeze()
        # x is shape (batch size, source sequence length, feature number)

        if self.custom_reduction:
            x = self.reduction(x, mask)
        else:
            x = self.reduction(x)
        # x is shape (batch size, source sequence length, 1)
        # mask is shape (batch size, source sequence length)
        sent_scores = x.squeeze(-1) * mask.float()
        sent_scores[sent_scores == 0] = -9e3
        return sent_scores
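
# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how `TransformerEncoderClassifier` might be called. The
# sizes and the helper name below are illustrative assumptions only; note that
# `d_model` must be divisible by `nhead` for the multi-head attention layers
# to initialize.
def _example_transformer_encoder_classifier():
    classifier = TransformerEncoderClassifier(d_model=768, nhead=8, num_layers=2)
    sent_vectors = torch.rand(2, 4, 768)  # (batch size, num sentences, d_model)
    mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real sentence, 0 = padding
    sent_scores = classifier(sent_vectors, mask)
    # `sent_scores` has shape (batch size, num sentences); padded positions are -9e3
    return sent_scores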