第四天-機器翻譯,注意力機制和Seq2seq模型,Transformer

機器翻譯和數據集

機器翻譯(MT):將一段文本從一種語言自動翻譯為另一種語言,用神經網絡解決這個問題通常稱為神經機器翻譯(NMT)。
主要特征:輸出是單詞序列而不是單個單詞。輸出序列的長度可能與源序列的長度不同。

import os
# List the entries of the input directory (the notebook displays the result below).
os.listdir('/home/kesci/input/')
['d2lzh1981',
 'fraeng6506',
 'houseprices2807',
 'd2l9528',
 'FashionMNIST2065',
 'jaychou_lyrics4703',
 'd2l_jay9460']
import sys
sys.path.append('/home/kesci/input/d2l9528/')
import collections
import d2l
import zipfile
from d2l.data.base import Vocab
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim

數(shù)據(jù)預(yù)處理

將數(shù)據(jù)集清洗、轉(zhuǎn)化為神經(jīng)網(wǎng)絡(luò)的輸入minbatch

# Read the whole tab-separated English/French corpus into memory.
# The text contains non-ASCII spaces (U+202F, U+00A0) that are cleaned up
# later, so decode explicitly as UTF-8 instead of relying on the platform
# default encoding.
with open('/home/kesci/input/fraeng6506/fra.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
print(raw_text[0:1000])
def preprocess_raw(text):
    """Normalize raw corpus text before tokenization.

    Replaces the narrow no-break space (U+202F) and the no-break space
    (U+00A0) with a plain space, lower-cases the text, and inserts a space
    in front of every ',', '!' or '.' that is not already preceded by one,
    so that a later split on ' ' yields punctuation as separate tokens.

    Args:
        text: raw text.

    Returns:
        The cleaned, lower-cased text.
    """
    # Lower-case once up front and index into the SAME string we iterate:
    # str.lower() can change the length of a string (e.g. U+0130), so
    # indexing the un-lowered text with positions of the lowered one could
    # look at the wrong character or raise IndexError.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    pieces = []
    for i, char in enumerate(text):
        if char in (',', '!', '.') and i > 0 and text[i - 1] != ' ':
            pieces.append(' ')
        pieces.append(char)
    # join() avoids the quadratic cost of repeated string concatenation.
    return ''.join(pieces)

# Clean the raw corpus and show the first 1000 characters of the result.
text = preprocess_raw(raw_text)
print(text[0:1000])

字符在計算機里是以編碼的形式存在,我們通常所用的空格是 \x20 ,是在標準ASCII可見字符 0x20~0x7e 范圍內。
而 \xa0 屬于 latin1 (ISO/IEC_8859-1)中的擴展字符集字符,代表不間斷空白符nbsp(non-breaking space),超出gbk編碼范圍,是需要去除的特殊字符。在數據預處理的過程中,我們首先需要對數據進行清洗。

分詞

字符串---單詞組成的列表

# Tokenize: every line is "<source>\t<target>[\t...]"; split both sides on
# single spaces to get lists of word tokens.
num_examples = 50000
source, target = [], []
for i, line in enumerate(text.split('\n')):
    # `>=` so that at most `num_examples` lines are read (the previous `>`
    # read one line too many); this matches load_data_nmt further below.
    if i >= num_examples:
        break
    parts = line.split('\t')
    # Skip lines that do not contain both a source and a target sentence.
    if len(parts) >= 2:
        source.append(parts[0].split(' '))
        target.append(parts[1].split(' '))

source[0:3], target[0:3]
# Plot histograms of the number of tokens per sentence for both languages.
d2l.set_figsize()
d2l.plt.hist([[len(l) for l in source], [len(l) for l in target]],label=['source', 'target'])
d2l.plt.legend(loc='upper right');

建立詞典

單詞組成的列表---單詞id組成的列表

def build_vocab(tokens):
    """Build a vocabulary from tokenized sentences.

    Args:
        tokens: iterable of token lists (one list per sentence).

    Returns:
        A ``d2l.data.base.Vocab`` built from the flattened tokens with
        ``min_freq=3`` and ``use_special_tokens=True``.
    """
    flat_tokens = []
    for sentence in tokens:
        flat_tokens.extend(sentence)
    return d2l.data.base.Vocab(flat_tokens, min_freq=3, use_special_tokens=True)

# Build the source-language vocabulary and display its size.
src_vocab = build_vocab(source)
len(src_vocab)
Image Name

載入數(shù)據(jù)集

def pad(line, max_len, padding_token):
    """Return `line` truncated or right-padded to exactly `max_len` items.

    Args:
        line: list of items.
        max_len: target length.
        padding_token: item appended when `line` is too short.

    Returns:
        A new list; `line` itself is not modified.
    """
    missing = max_len - len(line)
    if missing < 0:
        # Too long: keep only the first max_len items.
        return line[:max_len]
    return line + [padding_token] * missing
# Demo: map the first source sentence to indices and pad/truncate it to length 10.
pad(src_vocab[source[0]], 10, src_vocab.pad)
def build_array(lines, vocab, max_len, is_source):
    """Convert tokenized sentences into a padded index tensor.

    Args:
        lines: list of token lists.
        vocab: vocabulary; indexed with a token list and read for its
            `pad`, `bos` and `eos` attributes.
        max_len: every row is padded/truncated to this length.
        is_source: when False, each sentence is wrapped in bos ... eos
            before padding.

    Returns:
        (array, valid_len): the index tensor and, per row, the count of
        entries that differ from `vocab.pad`.
    """
    indexed = [vocab[sentence] for sentence in lines]
    if not is_source:
        indexed = [[vocab.bos] + ids + [vocab.eos] for ids in indexed]
    padded = [pad(ids, max_len, vocab.pad) for ids in indexed]
    array = torch.tensor(padded)
    # Count non-padding entries along dimension 1 of the tensor.
    valid_len = (array != vocab.pad).sum(dim=1)
    return array, valid_len
Image Name
def load_data_nmt(batch_size, max_len): # This function is saved in d2l.
    """Build vocabularies and a shuffled data iterator from the tokenized corpus.

    Reads the module-level `source` and `target` token lists.

    Args:
        batch_size: batch size passed to the DataLoader.
        max_len: length every sentence is padded/truncated to.

    Returns:
        (src_vocab, tgt_vocab, train_iter); each batch of `train_iter` is
        (source ids, source valid lengths, target ids, target valid lengths).
    """
    src_vocab = build_vocab(source)
    tgt_vocab = build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    dataset = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    loader = data.DataLoader(dataset, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, loader
# Demo: load the data with tiny batches and print only the first batch.
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)
for X, X_valid_len, Y, Y_valid_len, in train_iter:
    print('X =', X.type(torch.int32), '\nValid lengths for X =', X_valid_len,
        '\nY =', Y.type(torch.int32), '\nValid lengths for Y =', Y_valid_len)
    break

Encoder-Decoder

encoder:輸入到隱藏狀態(tài)
decoder:隱藏狀態(tài)到輸出

Image Name
class Encoder(nn.Module):
    """Interface for the encoder half of an encoder-decoder model."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, X, *args):
        """Encode `X`; subclasses must override this."""
        raise NotImplementedError
class Decoder(nn.Module):
    """Interface for the decoder half of an encoder-decoder model."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        """Derive the initial decoder state from the encoder outputs; must be overridden."""
        raise NotImplementedError

    def forward(self, X, state):
        """Decode `X` starting from `state`; must be overridden."""
        raise NotImplementedError
class EncoderDecoder(nn.Module):
    """Chain an encoder and a decoder into a single module."""

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        """Encode `enc_X`, initialise the decoder state from it, then decode `dec_X`.

        The extra positional `args` are forwarded both to the encoder and to
        the decoder's `init_state`.
        """
        encoded = self.encoder(enc_X, *args)
        initial_state = self.decoder.init_state(encoded, *args)
        return self.decoder(dec_X, initial_state)

可以應(yīng)用在對話系統(tǒng)、生成式任務(wù)中。

Sequence to Sequence模型

模型:

訓(xùn)練

Image Name

預(yù)測

Image Name

具體結(jié)構(gòu):

Image Name

Encoder

class Seq2SeqEncoder(d2l.Encoder):
    """Embedding layer followed by a multi-layer LSTM."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)

    def begin_state(self, batch_size, device):
        """Return two zero tensors of shape (num_layers, batch_size, num_hiddens).

        Not used by `forward`, which lets the LSTM create its own initial state.
        """
        shape = (self.num_layers, batch_size, self.num_hiddens)
        return [torch.zeros(size=shape, device=device) for _ in range(2)]

    def forward(self, X, *args):
        """Encode a batch of index sequences.

        Args:
            X: index tensor of shape (batch_size, seq_len).

        Returns:
            (out, state): `out` has shape (seq_len, batch_size, num_hiddens);
            `state` is the LSTM's (hidden state, memory cell) pair for the
            last time step, each (num_layers, batch_size, num_hiddens).
        """
        # The LSTM is time-major, so swap the batch and time axes after embedding.
        embedded = self.embedding(X).transpose(0, 1)
        out, state = self.rnn(embedded)
        return out, state
# Shape check: 4 sequences of 7 token ids through a 2-layer encoder.
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8,num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7),dtype=torch.long)
output, state = encoder(X)
output.shape, len(state), state[0].shape, state[1].shape

Decoder

class Seq2SeqDecoder(d2l.Decoder):
    """Embedding + multi-layer LSTM + linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        """Use the second element of the encoder result as the initial state."""
        return enc_outputs[1]

    def forward(self, X, state):
        """Decode a batch of index sequences.

        Args:
            X: index tensor of shape (batch_size, seq_len).
            state: LSTM state to start from.

        Returns:
            (scores, state): `scores` is batch-first with the vocabulary
            as the last dimension; `state` is the updated LSTM state.
        """
        embedded = self.embedding(X).transpose(0, 1)
        rnn_out, state = self.rnn(embedded, state)
        # Back to batch-first, which is what the loss computation expects.
        scores = self.dense(rnn_out).transpose(0, 1)
        return scores, state
# Shape check: initialise the decoder from the encoder result and decode X.
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8,num_hiddens=16, num_layers=2)
state = decoder.init_state(encoder(X))
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, state[1].shape

損失函數(shù)

def SequenceMask(X, X_len, value=0):
    """Overwrite, in place, every position of `X` beyond its row's valid length.

    Args:
        X: tensor whose first two dimensions are (batch, steps).
        X_len: 1-D tensor with one valid length per row of `X`.
        value: fill value for positions at index >= valid length.

    Returns:
        `X` itself (modified in place).
    """
    steps = torch.arange(X.size(1)).unsqueeze(0).to(X_len.device)
    keep = steps < X_len.unsqueeze(1)
    X[~keep] = value
    return X
# Demo on a 2-D tensor (fill value 0) ...
X = torch.tensor([[1,2,3], [4,5,6]])
SequenceMask(X,torch.tensor([1,2]))
# ... and on a 3-D tensor, where whole trailing slices are filled with -1.
X = torch.ones((2,3, 4))
SequenceMask(X, torch.tensor([1,2]),value=-1)
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Cross-entropy loss that zeroes out positions beyond each valid length."""

    def forward(self, pred, label, valid_length):
        """Compute the masked loss, averaged over the sequence dimension.

        Args:
            pred: scores of shape (batch_size, seq_len, vocab_size).
            label: target indices of shape (batch_size, seq_len).
            valid_length: tensor of shape (batch_size,).

        Returns:
            Tensor of shape (batch_size,): the mean over dim 1 of the
            per-token loss multiplied by the 0/1 mask.
        """
        # 1 for positions inside the valid length, 0 for the rest.
        mask = SequenceMask(torch.ones_like(label), valid_length).float()
        # Keep the per-token losses so that the mask can be applied to them.
        self.reduction = 'none'
        # CrossEntropyLoss wants the class dimension second.
        per_token = super().forward(pred.transpose(1, 2), label)
        return (per_token * mask).mean(dim=1)
# Demo: three sequences with valid lengths 4, 3 and 0.
loss = MaskedSoftmaxCELoss()
loss(torch.ones((3, 4, 10)), torch.ones((3,4),dtype=torch.long), torch.tensor([4,3,0]))

訓(xùn)練

def train_ch7(model, data_iter, lr, num_epochs, device):  # Saved in d2l
    """Train `model` with Adam and MaskedSoftmaxCELoss.

    Args:
        model: module called as model(X, Y_input, X_vlen, Y_vlen) that
            returns a pair whose first element is the prediction.
        data_iter: iterable of (X, X_vlen, Y, Y_vlen) tensor batches.
        lr: learning rate for Adam.
        num_epochs: number of passes over `data_iter`.
        device: device the model and every batch are moved to.

    Every 50 epochs prints the epoch's summed loss divided by the number of
    valid target tokens, and the time elapsed since the previous report.
    """
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs+1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
            # Decoder input is Y without its last column, the label is Y
            # shifted left by one; both are one token shorter than Y.
            Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
            
            Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
            l = loss(Y_hat, Y_label, Y_vlen).sum()
            l.backward()

            # Clip gradients before the optimizer step.
            # NOTE(review): the meaning of the argument 5 depends on
            # d2l.grad_clipping_nn, which is not visible here.
            with torch.no_grad():
                d2l.grad_clipping_nn(model, 5, device)
            num_tokens = Y_vlen.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d},loss {1:.3f}, time {2:.1f} sec".format( 
                  epoch, (l_sum/num_tokens_sum), time.time()-tic))
            # Restart the timer so the next report covers only the next 50 epochs.
            tic = time.time()
# Hyper-parameters for the plain seq2seq model.
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
# NOTE(review): num_examples is the float 1e3; presumably d2l.load_data_nmt
# only compares it against a line index - confirm.
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()
# This uses the d2l package's load_data_nmt, not the two-argument function
# defined earlier in this file.
src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(
    batch_size, max_len,num_examples)
encoder = Seq2SeqEncoder(
    len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(
    len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)

測試

def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
    """Translate one sentence by always picking the highest-scoring next token.

    Args:
        model: object exposing `encoder` and `decoder` (with `init_state`).
        src_sentence: source sentence; lower-cased and split on ' '.
        src_vocab: source vocabulary (token list -> index list, `pad`).
        tgt_vocab: target vocabulary (`bos`, `eos`, `to_tokens`).
        max_len: padding length of the source and maximum number of
            decoding steps.
        device: device on which the input tensors are created.

    Returns:
        The predicted target tokens joined with single spaces.
    """
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = len(src_tokens)
    n_pad = max_len - src_len
    if n_pad > 0:
        src_tokens.extend([src_vocab.pad] * n_pad)
    enc_valid_length = torch.tensor([src_len], device=device)
    # unsqueeze adds the batch dimension (batch size 1).
    enc_X = torch.tensor(src_tokens, device=device).unsqueeze(dim=0)
    enc_outputs = model.encoder(enc_X, enc_valid_length)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
    dec_X = torch.tensor([tgt_vocab.bos], device=device).unsqueeze(dim=0)
    predict_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The arg-max token becomes the decoder input of the next step.
        dec_X = Y.argmax(dim=2)
        token_id = dec_X.squeeze(dim=0).int().item()
        if token_id == tgt_vocab.eos:
            break
        predict_tokens.append(token_id)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))
# Translate a few short sentences with the trained model.
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + translate_ch7(
        model, sentence, src_vocab, tgt_vocab, max_len, ctx))

Beam Search

簡單greedy search:

Image Name

維特比算法:選擇整體分?jǐn)?shù)最高的句子(搜索空間太大)
集束搜索:

Image Name
# Marks the end of the first part of the notebook.
print("END")

注意力機(jī)制

在“編碼器—解碼器(seq2seq)”一節里,解碼器在各個時間步依賴相同的背景變量(context vector)來獲取輸入序列信息。當編碼器為循環神經網絡時,背景變量來自它最終時間步的隱藏狀態。將源序列輸入信息以循環單位狀態編碼,然后將其傳遞給解碼器以生成目標序列。然而這種結構存在著問題,尤其是RNN機制實際中存在長程梯度消失的問題,對于較長的句子,我們很難寄希望于將輸入的序列轉化為定長的向量而保存所有的有效信息,所以隨著所需翻譯句子的長度的增加,這種結構的效果會顯著下降。

與此同時,解碼的目標(biāo)詞語可能只與原輸入的部分詞語有關(guān),而并不是與所有的輸入有關(guān)。例如,當(dāng)把“Hello world”翻譯成“Bonjour le monde”時,“Hello”映射成“Bonjour”,“world”映射成“monde”。在seq2seq模型中,解碼器只能隱式地從編碼器的最終狀態(tài)中選擇相應(yīng)的信息。然而,注意力機(jī)制可以將這種選擇過程顯式地建模。

Image Name

注意力機(jī)制框架

Attention 是一種通用的帶權池化方法,輸入由兩部分構成:詢問(query)和鍵值對(key-value pairs)。\mathbf k_i \in \mathbb R^{d_k}, \mathbf v_i \in \mathbb R^{d_v}. Query \mathbf q \in \mathbb R^{d_q} , attention layer得到輸出與value的維度一致 \mathbf o \in \mathbb R^{d_v}. 對于一個query來說,attention layer 會與每一個key計算注意力分數并進行權重的歸一化,輸出的向量 \mathbf o 則是value的加權求和,而每個key計算的權重與value一一對應。

為了計算輸出,我們首先假設(shè)有一個函數(shù)\alpha 用于計算query和key的相似性,然后可以計算所有的 attention scores a_1, \ldots, a_n by

a_i = \alpha(\mathbf q, \mathbf k_i).

我們使用 softmax函數(shù) 獲得注意力權(quán)重:

b_1, \ldots, b_n = \textrm{softmax}(a_1, \ldots, a_n).

最終的輸出就是value的加權(quán)求和:

\mathbf o = \sum_{i=1}^n b_i \mathbf v_i.

Image Name

不同的attetion layer的區(qū)別在于score函數(shù)的選擇,在本節(jié)的其余部分,我們將討論兩個常用的注意層 Dot-product Attention 和 Multilayer Perceptron Attention;隨后我們將實現(xiàn)一個引入attention的seq2seq模型并在英法翻譯語料上進(jìn)行訓(xùn)練與測試。

import math
import torch 
import torch.nn as nn
import os
def file_name_walk(file_dir):
    """Print the sub-directories and files of every directory under `file_dir`.

    For each directory visited by os.walk, one "dirs" line and one "files"
    line are printed; nothing is returned.
    """
    for _root, subdirs, filenames in os.walk(file_dir):
        print("dirs", subdirs)
        print("files", filenames)

# Show what the dataset directory contains (output pasted below).
file_name_walk("/home/kesci/input/fraeng6506")
dirs []
files ['_about.txt', 'fra.txt']

Softmax屏蔽

在深入研究實現(xiàn)之前,我們首先介紹softmax操作符的一個屏蔽操作。

def SequenceMask(X, X_len, value=-1e6):
    """Overwrite, in place, every position of `X` beyond its row's valid length.

    The default fill value is a large negative number, so that a following
    softmax gives (numerically) zero weight to the masked positions.

    Args:
        X: tensor whose first two dimensions are (rows, steps).
        X_len: 1-D tensor with one valid length per row of `X`.
        value: fill value for positions at index >= valid length.

    Returns:
        `X` itself (modified in place).
    """
    maxlen = X.size(1)
    # Build the mask on X's device: the position tensor used to be created
    # on the CPU unconditionally, which cannot be compared with lengths that
    # live on another device.
    X_len = X_len.to(X.device)
    positions = torch.arange(maxlen, dtype=torch.float, device=X.device)
    mask = positions[None, :] >= X_len[:, None]
    X[mask] = value
    return X
def masked_softmax(X, valid_length):
    """Softmax over the last dimension that ignores positions past a valid length.

    Args:
        X: 3-D tensor of scores.
        valid_length: None (plain softmax), a 1-D tensor with one length per
            element of X's first dimension, or a 2-D tensor with one length
            per (first, second) index pair of X.

    Returns:
        Tensor with the same shape as `X`.
    """
    softmax = nn.Softmax(dim=-1)
    if valid_length is None:
        return softmax(X)
    shape = X.shape
    if valid_length.dim() == 1:
        # One length per batch element -> one per row of the flattened
        # scores, e.g. [2, 3] -> [2, 2, 3, 3] for shape[1] == 2.
        # Done in torch instead of a numpy round trip guarded by a bare
        # `except:`; as before, the result is a float tensor on the CPU.
        valid_length = torch.repeat_interleave(
            valid_length.detach().cpu().float(), shape[1])
    else:
        valid_length = valid_length.reshape((-1,))
    # Fill masked elements with a large negative value, whose exp is ~0.
    X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
    return softmax(X).reshape(shape)
# Demo: the first batch element keeps 2 positions, the second keeps 3.
masked_softmax(torch.rand((2,2,4),dtype=torch.float), torch.FloatTensor([2,3]))
tensor([[[0.4055, 0.5945, 0.0000, 0.0000],
         [0.5396, 0.4604, 0.0000, 0.0000]],

        [[0.3266, 0.3686, 0.3048, 0.0000],
         [0.3770, 0.3882, 0.2348, 0.0000]]])

超出2維矩陣的乘法

X 和 Y 是維度分別為 (b,n,m) 和 (b,m,k) 的張量,進行 b 次二維矩陣乘法后得到 Z, 維度為 (b, n, k)

Z[i,:,:] = dot(X[i,:,:], Y[i,:,:])\qquad for\ i= 1,…,b\ .

# Demo: batched matrix product of a (2,1,3) and a (2,3,2) tensor.
torch.bmm(torch.ones((2,1,3), dtype = torch.float), torch.ones((2,3,2), dtype = torch.float))
tensor([[[3., 3.]],

        [[3., 3.]]])

點積注意力

The dot product 假設query和keys有相同的維度, 即 \forall i, \mathbf q, \mathbf k_i \in \mathbb R^d. 通過計算query和key轉置的乘積來計算attention score,通常還會除以 \sqrt{d} 以減少計算出來的score對維度 d 的依賴性,如下

\alpha(\mathbf q, \mathbf k) = \langle \mathbf q, \mathbf k \rangle / \sqrt{d}

假設 \mathbf Q \in \mathbb R^{m \times d} 有 m 個query,\mathbf K \in \mathbb R^{n \times d} 有 n 個keys. 我們可以通過矩陣運算的方式計算所有 mn 個score:

\alpha(\mathbf Q, \mathbf K) = \mathbf Q \mathbf K^T / \sqrt{d}

現(xiàn)在讓我們實現(xiàn)這個層,它支持一批查詢和鍵值對。此外,它支持作為正則化隨機(jī)刪除一些注意力權(quán)重.

# Save to the d2l package.
class DotProductAttention(nn.Module):
    """Scaled dot-product attention with dropout on the attention weights.

    This variant also prints the attention weights on every forward call.
    """

    def __init__(self, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length=None):
        """Attend over `value` using the similarity between `query` and `key`.

        Args:
            query: (batch_size, #queries, d)
            key: (batch_size, #kv_pairs, d)
            value: (batch_size, #kv_pairs, dim_v)
            valid_length: None, (batch_size,) or (batch_size, xx)

        Returns:
            Tensor of shape (batch_size, #queries, dim_v).
        """
        scale = math.sqrt(query.shape[-1])
        # Swap the last two dimensions of key to get all query-key products.
        scores = torch.bmm(query, key.transpose(1, 2)) / scale
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        print("attention_weight\n",attention_weights)
        return torch.bmm(attention_weights, value)

測試

現(xiàn)在我們創(chuàng)建了兩個批,每個批有一個query和10個key-values對。我們通過valid_length指定,對于第一批,我們只關(guān)注前2個鍵-值對,而對于第二批,我們將檢查前6個鍵-值對。因此,盡管這兩個批處理具有相同的查詢和鍵值對,但我們獲得的輸出是不同的。

# Demo: two batch elements with identical keys/values but valid lengths 2 and 6.
atten = DotProductAttention(dropout=0)

keys = torch.ones((2,10,2),dtype=torch.float)
values = torch.arange((40), dtype=torch.float).view(1,10,4).repeat(2,1,1)
atten(torch.ones((2,1,2),dtype=torch.float), keys, values, torch.FloatTensor([2, 6]))
attention_weight
 tensor([[[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000]],

        [[0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000,
          0.0000, 0.0000]]])





tensor([[[ 2.0000,  3.0000,  4.0000,  5.0000]],

        [[10.0000, 11.0000, 12.0000, 13.0000]]])

多層感知機(jī)注意力

在多層感知器中,我們首先將 query and keys 投影到 \mathbb R^h .為了更具體,我們將可以學習的參數做如下映射
\mathbf W_k \in \mathbb R^{h \times d_k} , \mathbf W_q \in \mathbb R^{h \times d_q} , and \mathbf v \in \mathbb R^h . 將score函數定義為
\alpha(\mathbf k, \mathbf q) = \mathbf v^T \tanh(\mathbf W_k \mathbf k + \mathbf W_q \mathbf q)
.
這相當于將key 和 query 在特征的維度上合并(concatenate),然后送至 a single hidden layer perceptron,這層中 hidden layer 的大小為 h,輸出的size為 1 .隱層激活函數為tanh,無偏置.

# Save to the d2l package.
class MLPAttention(nn.Module):
    """Additive (MLP) attention: score(q, k) = v^T tanh(W_q q + W_k k).

    Args:
        units: size of the hidden projection shared by queries and keys.
        ipt_dim: feature size of both the queries and the keys.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, units,ipt_dim,dropout, **kwargs):
        super(MLPAttention, self).__init__(**kwargs)
        self.W_k = nn.Linear(ipt_dim, units, bias=False)
        self.W_q = nn.Linear(ipt_dim, units, bias=False)
        self.v = nn.Linear(units, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length=None):
        """Attend over `value` using additive query-key scores.

        Args:
            query: (batch_size, #queries, ipt_dim)
            key: (batch_size, #kv_pairs, ipt_dim)
            value: (batch_size, #kv_pairs, dim_v)
            valid_length: None, (batch_size,) or (batch_size, #queries)

        Returns:
            Tensor of shape (batch_size, #queries, dim_v).
        """
        # Each projection is applied to the input it is named after
        # (they used to be swapped).
        query, key = self.W_q(query), self.W_k(key)
        # Expand query to (batch_size, #queries, 1, units) and key to
        # (batch_size, 1, #kv_pairs, units), then add them with broadcasting.
        features = query.unsqueeze(2) + key.unsqueeze(1)
        # The tanh is essential: without it the score is linear, the query
        # term is the same for every key and cancels in the softmax, so the
        # attention weights would not depend on the query at all.
        scores = self.v(torch.tanh(features)).squeeze(-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)

測試

盡管MLPAttention包含一個額外的MLP模型,但如果給定相同的輸入和相同的鍵,我們將獲得與DotProductAttention相同的輸出。這是因為這里所有的key都相同,每個有效位置的注意力分數也就相同,softmax之后的權重是均勻的,與score函數的具體形式無關。

# Demo: same keys/values and valid lengths as the dot-product example above.
atten = MLPAttention(ipt_dim=2,units = 8, dropout=0)
atten(torch.ones((2,1,2), dtype = torch.float), keys, values, torch.FloatTensor([2, 6]))
tensor([[[ 2.0000,  3.0000,  4.0000,  5.0000]],

        [[10.0000, 11.0000, 12.0000, 13.0000]]], grad_fn=<BmmBackward>)

總結(jié)

  • 注意力層顯式地選擇相關(guān)的信息。
  • 注意層的內(nèi)存由鍵-值對組成,因此它的輸出接近于鍵類似于查詢的值。

引入注意力機(jī)制的Seq2seq模型

本節(jié)中將注意機(jī)制添加到sequence to sequence 模型中,以顯式地使用權(quán)重聚合states。下圖展示encoding 和decoding的模型結(jié)構(gòu),在時間步為t的時候。此刻attention layer保存著encodering看到的所有信息——即encoding的每一步輸出。在decoding階段,解碼器的t時刻的隱藏狀態(tài)被當(dāng)作query,encoder的每個時間步的hidden states作為key和value進(jìn)行attention聚合. Attetion model的輸出當(dāng)作成上下文信息context vector,并與解碼器輸入D_t拼接起來一起送到解碼器:

Image Name

Fig1具有注意機(jī)制的seq-to-seq模型解碼的第二步

下圖展示了seq2seq機(jī)制的所以層的關(guān)系,下面展示了encoder和decoder的layer結(jié)構(gòu)

Image Name

Fig2具有注意機(jī)制的seq-to-seq模型中層結(jié)構(gòu)

import sys
sys.path.append('/home/kesci/input/d2len9900')
import d2l

解碼器

由于帶有注意機(jī)制的seq2seq的編碼器與之前章節(jié)中的Seq2SeqEncoder相同,所以在此處我們只關(guān)注解碼器。我們添加了一個MLP注意層(MLPAttention),它的隱藏大小與解碼器中的LSTM層相同。然后我們通過從編碼器傳遞三個參數(shù)來初始化解碼器的狀態(tài):

  • the encoder outputs of all timesteps:encoder輸出的各個狀態(tài),被用于attetion layer的memory部分,有相同的key和values
  • the hidden state of the encoder’s final timestep:編碼器最后一個時間步的隱藏狀態(tài),被用于初始化decoder 的hidden state
  • the encoder valid length: 編碼器的有效長度,借此,注意層不會考慮編碼器輸出中的填充標(biāo)記(Paddings)

在解碼的每個時間步,我們使用解碼器的最后一個RNN層的輸出作為注意層的query。然后,將注意力模型的輸出與輸入嵌入向量連接起來,輸入到RNN層。雖然RNN層隱藏狀態(tài)也包含來自解碼器的歷史信息,但是attention model的輸出顯式地選擇了enc_valid_len以內(nèi)的編碼器輸出,這樣attention機(jī)制就會盡可能排除其他不相關(guān)的信息。

class Seq2SeqAttentionDecoder(d2l.Decoder):
    """LSTM decoder that attends over the encoder outputs at every time step.

    At each step the hidden state of the last LSTM layer is the attention
    query; the resulting context is concatenated with the token embedding
    and fed to the LSTM.
    """
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        # Both positional arguments (units and ipt_dim) are num_hiddens.
        self.attention_cell = MLPAttention(num_hiddens,num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The LSTM input is the embedding concatenated with the context vector.
        self.rnn = nn.LSTM(embed_size+ num_hiddens,num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens,vocab_size)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        """Build the decoder state from the encoder result.

        Returns (encoder outputs with the first two axes swapped,
        encoder final state, enc_valid_len).
        """
        outputs, hidden_state = enc_outputs
        # Transpose outputs to (batch_size, seq_len, hidden_size)
        return (outputs.permute(1,0,-1), hidden_state, enc_valid_len)
        
    def forward(self, X, state):
        """Decode `X` one time step at a time.

        Args:
            X: index tensor, batch-first.
            state: (enc_outputs, hidden_state, enc_valid_len) as returned by
                `init_state` or by a previous call.

        Returns:
            (scores, new_state): batch-first scores over the vocabulary and
            the state list [enc_outputs, hidden_state, enc_valid_len].
        """
        enc_outputs, hidden_state, enc_valid_len = state
        # Embed and make time the first axis so the loop iterates over steps.
        X = self.embedding(X).transpose(0,1)
        outputs = []
        for l, x in enumerate(X):
            # query shape: (batch_size, 1, hidden_size)
            # select hidden state of the last rnn layer as query
            query = hidden_state[0][-1].unsqueeze(1)
            # context has same shape as query
            context = self.attention_cell(query, enc_outputs, enc_outputs, enc_valid_len)
            # Concatenate on the feature dimension
            x = torch.cat((context, x.unsqueeze(1)), dim=-1)
            # Reshape x to (1, batch_size, embed_size+hidden_size)
            out, hidden_state = self.rnn(x.transpose(0,1), hidden_state)
            outputs.append(out)
        # Stack the per-step outputs along time, then project onto the vocabulary.
        outputs = self.dense(torch.cat(outputs, dim=0))
        return outputs.transpose(0, 1), [enc_outputs, hidden_state,
                                        enc_valid_len]

現(xiàn)在我們可以用注意力模型來測試seq2seq。為了與第9.7節(jié)中的模型保持一致,我們對vocab_size、embed_size、num_hiddens和num_layers使用相同的超參數(shù)。結(jié)果,我們得到了相同的解碼器輸出形狀,但是狀態(tài)結(jié)構(gòu)改變了。

# Shape check for the attention decoder.
encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8,
                            num_hiddens=16, num_layers=2)
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8,
                                  num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7),dtype=torch.long)
print("batch size=4\nseq_length=7\nhidden dim=16\nnum_layers=2\n")
# Run the encoder once and reuse the result instead of repeating the same
# forward pass for every line below.
enc_outputs = encoder(X)
print('encoder output size:', enc_outputs[0].size())
print('encoder hidden size:', enc_outputs[1][0].size())
print('encoder memory size:', enc_outputs[1][1].size())
# No valid length is given, so attention is computed over all encoder steps.
state = decoder.init_state(enc_outputs, None)
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape

訓(xùn)練

與第9.7.4節相似,通過應用相同的訓練超參數和相同的訓練損失來嘗試一個簡單的玩具模型(toy model)。從結果中我們可以看出,由于訓練數據集中的序列相對較短,額外的注意層并沒有帶來顯著的改進。由于編碼器和解碼器的注意層的計算開銷,該模型比沒有注意的seq2seq模型慢得多。

import zipfile
import torch
import requests
from io import BytesIO
from torch.utils import data
import sys
import collections

class Vocab(object): # This class is saved in d2l.
    """Token <-> index mapping ordered by descending token frequency.

    Args:
        tokens: flat iterable of tokens to count.
        min_freq: tokens that occur fewer times than this are left out and
            therefore map to the unknown index.
        use_special_tokens: when True, indices 0-3 are reserved for
            '<pad>', '<bos>', '<eos>' and '<unk>' (exposed as the attributes
            `pad`, `bos`, `eos`, `unk`); otherwise only '<unk>' is reserved
            at index 0.
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = collections.Counter(tokens)
        # Sort by token first and then (stably) by frequency, so tokens with
        # equal counts end up in a deterministic order.
        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            # The markers had been stripped down to four empty strings,
            # which made all special tokens collide on '' and hid them in
            # the output of to_tokens.
            tokens = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            tokens = ['<unk>']
        tokens += [token for token, freq in token_freqs if freq >= min_freq]
        self.idx_to_token = []
        self.token_to_idx = dict()
        for token in tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a list/tuple of tokens, to indices (unknown -> `unk`)."""
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def load_data_nmt(batch_size, max_len, num_examples=1000):
    """Load the English-French corpus, return its vocabularies and data iterator.

    The corpus is read from a fixed local path (nothing is downloaded),
    cleaned, tokenized on spaces and converted to padded index tensors.

    Args:
        batch_size: batch size of the returned iterator.
        max_len: length every sentence is padded/truncated to.
        num_examples: maximum number of corpus lines to read.

    Returns:
        (src_vocab, tgt_vocab, train_iter); each batch of `train_iter` is
        (source ids, source valid lengths, target ids, target valid lengths).
    """
    def preprocess_raw(text):
        # Replace non-breaking spaces, lower-case, and put a space in front
        # of ',', '!' and '.' when there is none yet.
        text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
        pieces = []
        for i, char in enumerate(text):
            # `i > 0`: without it the first character was compared against
            # text[-1], i.e. the LAST character of the corpus.
            if char in (',', '!', '.') and i > 0 and text[i - 1] != ' ':
                pieces.append(' ')
            pieces.append(char)
        return ''.join(pieces)

    # The corpus contains non-ASCII characters, so decode explicitly.
    with open('/home/kesci/input/fraeng6506/fra.txt', 'r', encoding='utf-8') as f:
        raw_text = f.read()

    text = preprocess_raw(raw_text)

    # Tokenize
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) >= 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))

    # Build vocab
    def build_vocab(tokens):
        tokens = [token for line in tokens for token in line]
        return Vocab(tokens, min_freq=3, use_special_tokens=True)

    # Convert to index arrays
    def pad(line, max_len, padding_token):
        if len(line) > max_len:
            return line[:max_len]
        return line + [padding_token] * (max_len - len(line))

    def build_array(lines, vocab, max_len, is_source):
        lines = [vocab[line] for line in lines]
        if not is_source:
            # Target sentences are wrapped in <bos> ... <eos>.
            lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
        array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
        valid_len = (array != vocab.pad).sum(1)
        return array, valid_len

    # The vocabularies are built exactly once (they used to be built twice).
    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter
# Hyper-parameters for the seq2seq model with attention.
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_steps = 64, 10
lr, num_epochs, ctx = 0.005, 500, d2l.try_gpu()

# num_steps is passed as max_len; num_examples keeps its default of 1000.
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size, num_steps)
encoder = d2l.Seq2SeqEncoder(
    len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(
    len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)

訓(xùn)練和預(yù)測

# Train with the d2l helper, then translate a few sample sentences.
d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, ctx)
for sentence in ['Go .', 'Good Night !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + d2l.predict_s2s_ch9(
        model, sentence, src_vocab, tgt_vocab, num_steps, ctx))

Transformer

在之前的章節(jié)中,我們已經(jīng)介紹了主流的神經(jīng)網(wǎng)絡(luò)架構(gòu)如卷積神經(jīng)網(wǎng)絡(luò)(CNNs)和循環(huán)神經(jīng)網(wǎng)絡(luò)(RNNs)。讓我們進(jìn)行一些回顧:

  • CNNs 易于并行化,卻不適合捕捉變長序列內(nèi)的依賴關(guān)系。
  • RNNs 適合捕捉長距離變長序列的依賴,但是卻難以實現(xiàn)并行化處理序列。

為了整合CNN和RNN的優(yōu)勢,[Vaswani et al., 2017] 創(chuàng)新性地使用注意力機(jī)制設(shè)計了Transformer模型。該模型利用attention機(jī)制實現(xiàn)了并行化捕捉序列依賴,并且同時處理序列的每個位置的tokens,上述優(yōu)勢使得Transformer模型在性能優(yōu)異的同時大大減少了訓(xùn)練時間。

圖10.3.1展示了Transformer模型的架構(gòu),與9.7節(jié)的seq2seq模型相似,Transformer同樣基于編碼器-解碼器架構(gòu),其區(qū)別主要在于以下三點:

  1. Transformer blocks:將seq2seq模型中的循環網絡替換為了Transformer Blocks,該模塊包含一個多頭注意力層(Multi-head Attention Layers)以及一個由兩個全連接層組成的position-wise feed-forward network(FFN)。對于解碼器來說,另一個多頭注意力層被用于接受編碼器的隱藏狀態。
  2. Add and norm:多頭注意力層和前饋網(wǎng)絡(luò)的輸出被送到兩個“add and norm”層進(jìn)行處理,該層包含殘差結(jié)構(gòu)以及層歸一化。
  3. Position encoding:由于自注意力層并沒有區(qū)分元素的順序,所以一個位置編碼層被用于向序列元素里添加位置信息。
Fig. 10.3.1 The Transformer architecture.

Fig.10.3.1\ Transformer 架構(gòu).

在接下來的部分,我們將會帶領(lǐng)大家實現(xiàn)Transformer里全新的子結(jié)構(gòu),并且構(gòu)建一個神經(jīng)機(jī)器翻譯模型用以訓(xùn)練和測試。

import os
import math
import numpy as np
import torch 
import torch.nn as nn
import torch.nn.functional as F
import sys
sys.path.append('/home/kesci/input/d2len9900')
import d2l

以下是復(fù)制了上一小節(jié)中 masked softmax 實現(xiàn),這里就不再贅述了。

def SequenceMask(X, X_len, value=-1e6):
    """Overwrite, in place, every position of `X` beyond its row's valid length.

    Args:
        X: tensor whose first two dimensions are (rows, steps).
        X_len: 1-D tensor with one valid length per row; it is moved to
            X's device before the comparison.
        value: fill value for positions at index >= valid length.

    Returns:
        `X` itself (modified in place).
    """
    device = X.device
    lengths = X_len.to(device)
    positions = torch.arange(X.size(1), dtype=torch.float, device=device)
    valid = positions.unsqueeze(0) < lengths.unsqueeze(1)
    X[~valid] = value
    return X

def masked_softmax(X, valid_length):
    """Softmax over the last dimension that ignores positions past a valid length.

    Args:
        X: 3-D tensor of scores.
        valid_length: None (plain softmax), a 1-D tensor with one length per
            element of X's first dimension, or a 2-D tensor with one length
            per (first, second) index pair of X.

    Returns:
        Tensor with the same shape as `X`.
    """
    softmax = nn.Softmax(dim=-1)
    if valid_length is None:
        return softmax(X)
    shape = X.shape
    if valid_length.dim() == 1:
        # One length per batch element -> one per row of the flattened
        # scores, e.g. [2, 3] -> [2, 2, 3, 3] for shape[1] == 2.
        # Done in torch instead of a numpy round trip guarded by a bare
        # `except:`; as before, the result is a float tensor on the CPU.
        valid_length = torch.repeat_interleave(
            valid_length.detach().cpu().float(), shape[1])
    else:
        valid_length = valid_length.reshape((-1,))
    # Fill masked elements with a large negative value, whose exp is ~0.
    X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
    return softmax(X).reshape(shape)

# Save to the d2l package.
class DotProductAttention(nn.Module):
    """Scaled dot-product attention with dropout on the attention weights."""

    def __init__(self, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length=None):
        """Attend over `value` using the similarity between `query` and `key`.

        Args:
            query: (batch_size, #queries, d)
            key: (batch_size, #kv_pairs, d)
            value: (batch_size, #kv_pairs, dim_v)
            valid_length: None, (batch_size,) or (batch_size, xx)

        Returns:
            Tensor of shape (batch_size, #queries, dim_v).
        """
        scale = math.sqrt(query.shape[-1])
        # Swap the last two dimensions of key to get all query-key products.
        scores = torch.bmm(query, key.transpose(1, 2)) / scale
        weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(weights, value)

多頭注意力層

在我們討論多頭注意力層之前,先來迅速理解以下自注意力(self-attention)的結(jié)構(gòu)。自注意力模型是一個正規(guī)的注意力模型,序列的每一個元素對應(yīng)的key,value,query是完全一致的。如圖10.3.2 自注意力輸出了一個與輸入長度相同的表征序列,與循環(huán)神經(jīng)網(wǎng)絡(luò)相比,自注意力對每個元素輸出的計算是并行的,所以我們可以高效的實現(xiàn)這個模塊。

Fig. 10.3.2 自注意力結(jié)構(gòu)

Fig.10.3.2\ 自注意力結(jié)構(gòu)

多頭注意力層包含h個并行的自注意力層,每一個這種層被稱為一個head。對每個頭來說,在進行注意力計算之前,我們會將query、key和value用三個線性層進行映射,這h個注意力頭的輸出將會被拼接之后輸入最后一個線性層進行整合。

Image Name

Fig.10.3.3\ 多頭注意力

假設query,key和value的維度分別是 d_q、d_k 和 d_v。那么對于每一個頭 i=1,\ldots,h,我們可以訓練相應的模型權重 W_q^{(i)} \in \mathbb{R}^{p_q\times d_q}、W_k^{(i)} \in \mathbb{R}^{p_k\times d_k} 和 W_v^{(i)} \in \mathbb{R}^{p_v\times d_v},以得到每個頭的輸出:

o^{(i)} = attention(W_q^{(i)}q, W_k^{(i)}k, W_v^{(i)}v)

這里的attention可以是任意的attention function,比如前一節(jié)介紹的dot-product attention以及MLP attention。之后我們將所有head對應(yīng)的輸出拼接起來,送入最后一個線性層進(jìn)行整合,這個層的權(quán)重可以表示為W_o\in \mathbb{R}^{d_0 \times hp_v}

o = W_o[o^{(1)}, \ldots, o^{(h)}]

接下來我們就可以來實現(xiàn)多頭注意力了,假設(shè)我們有h個頭,隱藏層權(quán)重 hidden\_size = p_q = p_k = p_v 與query,key,value的維度一致。除此之外,因為多頭注意力層保持輸入與輸出張量的維度不變,所以輸出feature的維度也設(shè)置為 d_0 = hidden\_size

class MultiHeadAttention(nn.Module):
    """Multi-head attention built on DotProductAttention.

    Queries, keys and values are projected to `hidden_size` features, split
    into `num_heads` heads that attend independently, and the concatenated
    head outputs go through a final linear layer.

    Args:
        input_size: feature size of query, key and value.
        hidden_size: total projected size; split evenly across the heads.
        num_heads: number of attention heads.
        dropout: dropout probability on the attention weights.
    """

    def __init__(self, input_size, hidden_size, num_heads, dropout, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = DotProductAttention(dropout)
        self.W_q = nn.Linear(input_size, hidden_size, bias=False)
        self.W_k = nn.Linear(input_size, hidden_size, bias=False)
        self.W_v = nn.Linear(input_size, hidden_size, bias=False)
        self.W_o = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, query, key, value, valid_length):
        """Apply multi-head attention.

        Args:
            query, key, value: (batch_size, seq_len, input_size).
            valid_length: None, (batch_size,) or (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, #queries, hidden_size).
        """
        # Project, then reshape from (batch_size, seq_len, hidden_size) to
        # (batch_size * num_heads, seq_len, hidden_size / num_heads).
        query = transpose_qkv(self.W_q(query), self.num_heads)
        key = transpose_qkv(self.W_k(key), self.num_heads)
        value = transpose_qkv(self.W_v(value), self.num_heads)

        if valid_length is not None:
            # transpose_qkv lays rows out batch-major (all heads of sample 0,
            # then all heads of sample 1, ...), so every length must be
            # repeated num_heads times IN PLACE: [2, 3] -> [2, 2, 2, 3, 3, 3].
            # The previous np.tile produced [2, 3, 2, 3, 2, 3], which gave
            # heads the valid length of a different sample.
            valid_length = torch.repeat_interleave(
                valid_length.float(), self.num_heads, dim=0)

        output = self.attention(query, key, value, valid_length)
        # Concatenate the heads back along the feature dimension.
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)
def transpose_qkv(X, num_heads):
    """Split the feature axis into heads and fold the heads into the batch.

    X has shape (batch_size, seq_len, features). The result has shape
    (batch_size * num_heads, seq_len, features / num_heads), where the
    num_heads slices belonging to one example occupy consecutive rows.
    """
    batch_size, seq_len = X.shape[0], X.shape[1]

    # (batch_size, seq_len, num_heads, per_head); -1 infers the per-head width.
    per_head = X.view(batch_size, seq_len, num_heads, -1)

    # (batch_size, num_heads, seq_len, per_head); made contiguous so the
    # following view is legal.
    heads_first = per_head.permute(0, 2, 1, 3).contiguous()

    # Merge the two leading axes: (batch_size * num_heads, seq_len, per_head).
    return heads_first.view(-1, seq_len, heads_first.shape[3])


# Saved in the d2l package for later use
def transpose_output(X, num_heads):
    """Undo ``transpose_qkv``: unfold the heads and concatenate them.

    X has shape (batch_size * num_heads, seq_len, per_head); the result has
    shape (batch_size, seq_len, num_heads * per_head).
    """
    seq_len, per_head = X.shape[1], X.shape[2]
    # (batch_size, num_heads, seq_len, per_head)
    unfolded = X.view(-1, num_heads, seq_len, per_head)
    # (batch_size, seq_len, num_heads, per_head)
    seq_major = unfolded.permute(0, 2, 1, 3).contiguous()
    return seq_major.view(seq_major.shape[0], seq_len, -1)
# Smoke test: input_size=5, hidden_size=9, num_heads=3, dropout=0.5.
cell = MultiHeadAttention(5, 9, 3, 0.5)
# Batch of 2 sequences, 4 positions each, 5 features per position.
X = torch.ones((2, 4, 5))
# One valid length per sequence; this global is reused by later cells.
valid_length = torch.FloatTensor([2, 3])
# Self-attention (query = key = value); W_o makes the last dimension 9.
cell(X, X, X, valid_length).shape

基于位置的前饋網絡

Transformer 模塊另一個非常重要的部分就是基于位置的前饋網絡(FFN),它接受一個形狀為(batch_size, seq_length, feature_size)的三維張量。Position-wise FFN由兩個全連接層組成,它們作用在最后一維上。因為序列的每個位置的狀態都會被單獨地更新,所以我們稱它為position-wise,這等效于一個1x1的卷積。

下面我們來實現(xiàn)PositionWiseFFN:

# Save to the d2l package.
class PositionWiseFFN(nn.Module):
    """Two linear layers with a ReLU in between, applied to the last axis.

    Args:
        input_size: size of the last dimension of the input.
        ffn_hidden_size: width of the intermediate layer.
        hidden_size_out: size of the last dimension of the output.
    """

    def __init__(self, input_size, ffn_hidden_size, hidden_size_out, **kwargs):
        super().__init__(**kwargs)
        self.ffn_1 = nn.Linear(in_features=input_size,
                               out_features=ffn_hidden_size)
        self.ffn_2 = nn.Linear(in_features=ffn_hidden_size,
                               out_features=hidden_size_out)

    def forward(self, X):
        """Return ``ffn_2(relu(ffn_1(X)))``."""
        hidden = self.ffn_1(X)
        activated = F.relu(hidden)
        return self.ffn_2(activated)

與多頭注意力層相似,FFN層同樣只會對最后一維的大小進行改變;除此之外,對于兩個完全相同的輸入,FFN層的輸出也將相等。

# input_size=4, ffn_hidden_size=4, hidden_size_out=8.
ffn = PositionWiseFFN(4, 4, 8)
# All-ones input of shape (2, 3, 4): every position is identical, so every
# position is mapped to the same output vector.
out = ffn(torch.ones((2,3,4)))

print(out, out.shape)

Add and Norm

除了上面兩個模塊之外,Transformer還有一個重要的相加歸一化層,它可以平滑地整合輸入和其他層的輸出,因此我們在每個多頭注意力層和FFN層后面都添加一個含殘差連接的Layer Norm層。這里 Layer Norm 與7.5小節的Batch Norm很相似,唯一的區別在于Batch Norm是對于batch size這個維度進行計算均值和方差的,而Layer Norm則是對最后一維進行計算。層歸一化可以防止層內的數值變化過大,從而有利于加快訓練速度并且提高泛化性能。 (ref)

# Compare the two normalizations on the same 2x2 input.
# LayerNorm normalizes over the last dimension (size 2), i.e. within each row.
layernorm = nn.LayerNorm(normalized_shape=2, elementwise_affine=True)
# BatchNorm1d normalizes each of the 2 features across the batch dimension.
batchnorm = nn.BatchNorm1d(num_features=2, affine=True)
X = torch.FloatTensor([[1,2], [3,4]])
print('layer norm:', layernorm(X))
print('batch norm:', batchnorm(X))
# Save to the d2l package.
class AddNorm(nn.Module):
    """Residual connection followed by layer normalization.

    Args:
        hidden_size: size of the normalized (last) dimension.
        dropout: dropout rate applied to the sub-layer output before it is
            added to the residual input.
    """

    def __init__(self, hidden_size, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(p=dropout)
        self.norm = nn.LayerNorm(normalized_shape=hidden_size)

    def forward(self, X, Y):
        """Return ``LayerNorm(dropout(Y) + X)``; X is the residual input."""
        regularized = self.dropout(Y)
        summed = regularized + X
        return self.norm(summed)

由于殘差連接,X和Y需要有相同的維度。

# hidden_size=4, dropout=0.5; both inputs share the shape (2, 3, 4), which the
# element-wise residual addition requires.
add_norm = AddNorm(4, 0.5)
add_norm(torch.ones((2,3,4)), torch.ones((2,3,4))).shape

位置編碼

與循環神經網絡不同,無論是多頭注意力網絡還是前饋神經網絡都是獨立地對每個位置的元素進行更新,這種特性幫助我們實現了高效的并行,卻丟失了重要的序列順序的信息。為了更好地捕捉序列信息,Transformer模型引入了位置編碼去保持輸入序列元素的位置。

假設輸入序列的嵌入表示 $X \in \mathbb{R}^{l\times d}$,序列長度為 $l$,嵌入向量維度為 $d$,則其位置編碼為 $P \in \mathbb{R}^{l\times d}$,輸出的向量就是二者相加 $X + P$。

位置編碼是一個二維的矩陣,i對應著序列中的順序,j對應其embedding vector內部的維度索引。我們可以通過以下等式計算位置編碼:

$$P_{i,2j} = \sin\left(i / 10000^{2j/d}\right)$$

$$P_{i,2j+1} = \cos\left(i / 10000^{2j/d}\right)$$

$$\text{for } i = 0, \ldots, l-1 \text{ and } j = 0, \ldots, \lfloor (d-1)/2 \rfloor$$

Image Name

Fig. 10.3.4\ 位置編碼

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then dropout.

    The table ``P`` has shape (1, max_len, embedding_size). For position
    ``i``, even columns ``2j`` hold ``sin(i / 10000^(2j / embedding_size))``
    and odd columns ``2j + 1`` hold the matching cosine.

    Args:
        embedding_size: size of the last dimension of the input; odd sizes
            are supported (the last sine column then has no cosine partner).
        dropout: dropout rate applied to the sum.
        max_len: number of positions precomputed; inputs must not be longer.
    """

    def __init__(self, embedding_size, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        positions = np.arange(0, max_len).reshape(-1, 1)
        scales = np.power(
            10000, np.arange(0, embedding_size, 2) / embedding_size)
        # (max_len, ceil(embedding_size / 2)): one angle per sin/cos pair.
        angles = positions / scales
        table = np.zeros((1, max_len, embedding_size))
        table[:, :, 0::2] = np.sin(angles)
        # For an odd embedding_size there is one cosine column fewer than
        # sine columns, so the surplus angle column is dropped.
        table[:, :, 1::2] = np.cos(angles[:, :embedding_size // 2])
        # Kept as a plain attribute (not a parameter or buffer) so the
        # module's state_dict is unchanged.
        self.P = torch.FloatTensor(table)

    def forward(self, X):
        """Return ``dropout(X + P[:, :seq_len])`` for X of shape
        (batch_size, seq_len, embedding_size)."""
        # P is not moved by Module.to()/cuda(), so follow the input to
        # whatever device it lives on (any GPU index, or back to the CPU).
        if self.P.device != X.device:
            self.P = self.P.to(X.device)
        X = X + self.P[:, :X.shape[1], :]
        return self.dropout(X)

測試

下面我們用PositionalEncoding這個類進行一個小測試,取其中的四個維度進行可視化。我們可以看到,第4維和第5維有相同的頻率但偏置不同。第6維和第7維具有更低的頻率;因此positional encoding對于不同維度具有可區分性。

import numpy as np
# embedding_size=20 and dropout=0, so the output is exactly input + encoding.
pe = PositionalEncoding(20, 0)
# A zero input makes Y equal to the encoding table for the first 100 positions.
Y = pe(torch.zeros((1, 100, 20))).numpy()
# Plot encoding dimensions 4..7 against the position index.
d2l.plot(np.arange(100), Y[0, :, 4:8].T, figsize=(6, 2.5),
         legend=["dim %d" % p for p in [4, 5, 6, 7]])

編碼器

我們已經有了組成Transformer的各個模塊,現在我們可以開始搭建了!編碼器包含一個多頭注意力層,一個position-wise FFN,和兩個 Add and Norm層。對于attention模型以及FFN模型,我們的輸出維度都是與embedding維度一致的,這也是由于殘差連接天生的特性導致的,因為我們要將前一層的輸出與原始輸入相加并歸一化。

class EncoderBlock(nn.Module):
    """One encoder layer: self-attention and a position-wise FFN, each
    wrapped in an ``AddNorm`` residual step.

    Args:
        embedding_size: feature size used for the attention input/output,
            both AddNorm layers and the FFN input/output.
        ffn_hidden_size: width of the FFN's intermediate layer.
        num_heads: number of attention heads.
        dropout: dropout rate shared by the attention and AddNorm layers.
    """

    def __init__(self, embedding_size, ffn_hidden_size, num_heads,
                 dropout, **kwargs):
        super().__init__(**kwargs)
        self.attention = MultiHeadAttention(
            input_size=embedding_size, hidden_size=embedding_size,
            num_heads=num_heads, dropout=dropout)
        self.addnorm_1 = AddNorm(hidden_size=embedding_size, dropout=dropout)
        self.ffn = PositionWiseFFN(
            input_size=embedding_size, ffn_hidden_size=ffn_hidden_size,
            hidden_size_out=embedding_size)
        self.addnorm_2 = AddNorm(hidden_size=embedding_size, dropout=dropout)

    def forward(self, X, valid_length):
        """Run self-attention over X (query = key = value), then the FFN."""
        attended = self.attention(X, X, X, valid_length)
        Y = self.addnorm_1(X, attended)
        transformed = self.ffn(Y)
        return self.addnorm_2(Y, transformed)
# batch_size = 2, seq_len = 100, embedding_size = 24
# ffn_hidden_size = 48, num_head = 8, dropout = 0.5

X = torch.ones((2, 100, 24))
encoder_blk = EncoderBlock(24, 48, 8, 0.5)
# valid_length is the global defined in the multi-head attention demo cell.
encoder_blk(X, valid_length).shape

現在我們來實現整個Transformer 編碼器模型,整個編碼器由n個剛剛定義的Encoder Block堆疊而成,因為殘差連接的緣故,中間狀態的維度始終與嵌入向量的維度d一致;同時注意到我們把嵌入向量乘以 $\sqrt{d}$ 以防止其值過小。

class TransformerEncoder(d2l.Encoder):
    """Embedding, positional encoding and a stack of ``EncoderBlock`` layers.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_size: embedding width, kept by every layer of the stack.
        ffn_hidden_size: intermediate width of each block's FFN.
        num_heads: number of attention heads per block.
        num_layers: number of stacked encoder blocks.
        dropout: dropout rate for the positional encoding and the blocks.
    """

    def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
                 num_heads, num_layers, dropout, **kwargs):
        super().__init__(**kwargs)
        self.embedding_size = embedding_size
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.ModuleList([
            EncoderBlock(embedding_size, ffn_hidden_size, num_heads, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, X, valid_length, *args):
        """Encode token indices X; extra positional arguments are ignored."""
        # Embeddings are scaled by sqrt(embedding_size) before the position
        # encoding is added.
        scaled = self.embed(X) * math.sqrt(self.embedding_size)
        hidden = self.pos_encoding(scaled)
        for layer in self.blks:
            hidden = layer(hidden, valid_length)
        return hidden
# test encoder
# vocab_size=200, embedding_size=24, ffn_hidden_size=48, num_heads=8,
# num_layers=2, dropout=0.5.
encoder = TransformerEncoder(200, 24, 48, 8, 2, 0.5)
# Input: a (2, 100) batch of token index 1; valid_length is the global
# defined in the multi-head attention demo cell.
encoder(torch.ones((2, 100)).long(), valid_length).shape

解碼器

Transformer 模型的解碼器與編碼器結構類似,然而,除了之前介紹的幾個模塊之外,解碼器部分有另一個子模塊。該模塊也是多頭注意力層,接受編碼器的輸出作為key和value,decoder的狀態作為query。與編碼器部分相類似,解碼器同樣是使用了add and norm機制,用殘差和層歸一化將各個子層的輸出相連。

仔細來講,在第t個時間步,當前輸入 $x_t$ 是query,那么self attention接受了第t步的輸入以及前t-1步的所有輸入 $x_1, \ldots, x_{t-1}$。在訓練時,由于第t位置的輸入可以觀測到全部的序列,這與預測階段的情形相矛盾,所以我們要通過將第t個時間步所對應的可觀測長度設置為t,以消除不需要看到的未來的信息。

Image Name
class DecoderBlock(nn.Module):
    """One decoder layer: masked self-attention, encoder-decoder attention
    and a position-wise FFN, each followed by an ``AddNorm`` step.

    Args:
        embedding_size: feature size kept by every sub-layer.
        ffn_hidden_size: intermediate width of the FFN.
        num_heads: number of attention heads.
        dropout: dropout rate for the attention and AddNorm layers.
        i: index of this block; selects its slot in ``state[2]``.
    """

    def __init__(self, embedding_size, ffn_hidden_size, num_heads,dropout,i,**kwargs):
        super().__init__(**kwargs)
        self.i = i
        self.attention_1 = MultiHeadAttention(
            embedding_size, embedding_size, num_heads, dropout)
        self.addnorm_1 = AddNorm(embedding_size, dropout)
        self.attention_2 = MultiHeadAttention(
            embedding_size, embedding_size, num_heads, dropout)
        self.addnorm_2 = AddNorm(embedding_size, dropout)
        self.ffn = PositionWiseFFN(
            embedding_size, ffn_hidden_size, embedding_size)
        self.addnorm_3 = AddNorm(embedding_size, dropout)

    def forward(self, X, state):
        """Run the block on X and return ``(output, state)``.

        ``state`` is ``[enc_outputs, enc_valid_length, cache]``, where
        ``cache[self.i]`` is ``None`` or the inputs this block has seen on
        earlier calls. The new input is appended along the time axis and the
        result is stored back into the cache, so during step-by-step
        prediction the self-attention keys/values cover every step so far:

            love dogs  !  [EOS]
             |    |    |    |
               Transformer
                 Decoder
             |    |    |    |
             I  love dogs   !
        """
        enc_outputs, enc_valid_length, cache = state[0], state[1], state[2]

        previous = cache[self.i]
        if previous is None:
            key_values = X
        else:
            # (batch_size, steps so far, hidden_size)
            key_values = torch.cat((previous, X), dim=1)
        cache[self.i] = key_values

        valid_length = None
        if self.training:
            batch_size, seq_len, _ = X.shape
            # Row [1, 2, ..., seq_len] repeated for every example: the j-th
            # column holds j + 1, so position j attends to j + 1 positions.
            steps = torch.arange(
                1, seq_len + 1, dtype=torch.float32, device=X.device)
            valid_length = steps.repeat(batch_size, 1)

        self_attended = self.attention_1(X, key_values, key_values, valid_length)
        Y = self.addnorm_1(X, self_attended)
        cross_attended = self.attention_2(
            Y, enc_outputs, enc_outputs, enc_valid_length)
        Z = self.addnorm_2(Y, cross_attended)
        return self.addnorm_3(Z, self.ffn(Z)), state
# embedding_size=24, ffn_hidden_size=48, num_heads=8, dropout=0.5, block index 0.
decoder_blk = DecoderBlock(24, 48, 8, 0.5, 0)
X = torch.ones((2, 100, 24))
# state = [encoder output, encoder valid lengths, per-block cache]; the cache
# holds a single empty slot because only one decoder block is used here.
state = [encoder_blk(X, valid_length), valid_length, [None]]
decoder_blk(X, state)[0].shape

對于Transformer解碼器來說,構造方式與編碼器一樣,除了最后一層添加一個dense layer以獲得輸出的置信度分數。下面讓我們來實現一下Transformer Decoder,除了常規的超參數例如 vocab_size、embedding_size 之外,解碼器還需要編碼器的輸出 enc_outputs 和句子有效長度 enc_valid_length。

class TransformerDecoder(d2l.Decoder):
    """Embedding, positional encoding, a stack of ``DecoderBlock`` layers
    and a final linear layer that maps to vocabulary-sized scores.

    Args:
        vocab_size: size of the embedding table and of the output layer.
        embedding_size: embedding width, kept by every block.
        ffn_hidden_size: intermediate width of each block's FFN.
        num_heads: number of attention heads per block.
        num_layers: number of stacked decoder blocks.
        dropout: dropout rate for the positional encoding and the blocks.
    """

    def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
                 num_heads, num_layers, dropout, **kwargs):
        super().__init__(**kwargs)
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.ModuleList([
            DecoderBlock(embedding_size, ffn_hidden_size, num_heads,
                         dropout, layer_idx)
            for layer_idx in range(num_layers)
        ])
        self.dense = nn.Linear(embedding_size, vocab_size)

    def init_state(self, enc_outputs, enc_valid_length, *args):
        """Build the decoder state: encoder outputs, encoder valid lengths
        and one empty cache slot per decoder block."""
        per_layer_cache = [None for _ in range(self.num_layers)]
        return [enc_outputs, enc_valid_length, per_layer_cache]

    def forward(self, X, state):
        """Decode token indices X; return ``(scores, state)``."""
        scaled = self.embed(X) * math.sqrt(self.embedding_size)
        hidden = self.pos_encoding(scaled)
        for layer in self.blks:
            hidden, state = layer(hidden, state)
        return self.dense(hidden), state

訓(xùn)練

import zipfile
import torch
import requests
from io import BytesIO
from torch.utils import data
import sys
import collections

class Vocab(object): # This class is saved in d2l.
    """Bidirectional mapping between tokens and integer indices.

    Tokens are ordered by descending frequency, ties broken by ascending
    token order, and placed after the reserved special tokens.

    Args:
        tokens: flat iterable of tokens to count.
        min_freq: tokens seen fewer than this many times are left out and
            therefore map to the unknown index.
        use_special_tokens: when true, indices 0..3 are reserved for
            ``<pad>``, ``<bos>``, ``<eos>`` and ``<unk>`` and exposed as
            ``pad``, ``bos``, ``eos`` and ``unk``; otherwise only ``<unk>``
            is reserved, at index 0.
    """

    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = collections.Counter(tokens)
        # Most frequent first; equal counts fall back to token order.
        token_freqs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
        if use_special_tokens:
            # padding, begin of sentence, end of sentence, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            reserved = ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            reserved = ['<unk>']
        # Each special token needs its own distinct string: identical
        # placeholders would collapse into a single token_to_idx entry.
        self.idx_to_token = list(reserved)
        self.token_to_idx = {
            token: idx for idx, token in enumerate(self.idx_to_token)}
        for token, freq in token_freqs:
            # A corpus token spelled like a special token must not steal
            # that token's reserved index.
            if freq >= min_freq and token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of indices, special tokens included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Unknown tokens map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def load_data_nmt(batch_size, max_len, num_examples=1000,
                  data_path='/home/kesci/input/fraeng6506/fra.txt'):
    """Read a tab-separated NMT corpus; return its vocabularies and a
    data iterator.

    Args:
        batch_size: batch size of the returned ``DataLoader``.
        max_len: every sequence is truncated or padded to this length.
        num_examples: only the first ``num_examples`` lines are read.
        data_path: text file with one ``source<TAB>target`` pair per line.

    Returns:
        ``(src_vocab, tgt_vocab, train_iter)``. Each batch of ``train_iter``
        is ``(src_array, src_valid_len, tgt_array, tgt_valid_len)``.
    """
    def preprocess_raw(text):
        # Turn narrow/non-breaking spaces into plain spaces and lower-case
        # once, so that indexing and iteration use the same string.
        text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
        pieces = []
        for i, char in enumerate(text):
            # Detach punctuation from the preceding word. The i > 0 guard
            # keeps text[i - 1] from wrapping round to the last character.
            if char in (',', '!', '.') and i > 0 and text[i - 1] != ' ':
                pieces.append(' ')
            pieces.append(char)
        return ''.join(pieces)

    # The corpus contains accented characters, so do not rely on the
    # platform's default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()

    text = preprocess_raw(raw_text)

    # Tokenize
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if i >= num_examples:
            break
        parts = line.split('\t')
        if len(parts) >= 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))

    # Build vocab
    def build_vocab(tokens):
        tokens = [token for line in tokens for token in line]
        return Vocab(tokens, min_freq=3, use_special_tokens=True)

    # Convert to index arrays
    def pad(line, max_len, padding_token):
        if len(line) > max_len:
            return line[:max_len]
        return line + [padding_token] * (max_len - len(line))

    def build_array(lines, vocab, max_len, is_source):
        lines = [vocab[line] for line in lines]
        if not is_source:
            # Only target sentences are wrapped in <bos> ... <eos>.
            lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
        array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
        # Valid length = number of non-padding positions in each row.
        valid_len = (array != vocab.pad).sum(1)
        return array, valid_len

    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter
import os

import d2l

# The platform does not support GPUs for the time being, so training currently
# runs on the CPU automatically; the GPU will be used once it is available.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

embed_size, embedding_size, num_layers, dropout = 32, 32, 2, 0.05
batch_size, num_steps = 64, 10
lr, num_epochs, ctx = 0.005, 250, d2l.try_gpu()
print(ctx)
num_hiddens, num_heads = 64, 4

src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size, num_steps)

encoder = TransformerEncoder(
    len(src_vocab), embedding_size, num_hiddens, num_heads, num_layers,
    dropout)
# The decoder embeds and scores *target* tokens, so its embedding table and
# output layer must be sized by the target vocabulary, not the source one.
decoder = TransformerDecoder(
    len(tgt_vocab), embedding_size, num_hiddens, num_heads, num_layers,
    dropout)
model = d2l.EncoderDecoder(encoder, decoder)
d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, ctx)
# Switch to evaluation mode before translating the sample sentences.
model.eval()
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + d2l.predict_s2s_ch9(
        model, sentence, src_vocab, tgt_vocab, num_steps, ctx))
print("END")
©著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內(nèi)容