Machine Translation and Datasets
Machine translation (MT): automatically translating a piece of text from one language into another. Solving this problem with neural networks is usually called neural machine translation (NMT).
Key characteristics: the output is a sequence of words rather than a single word, and the length of the output sequence may differ from that of the source sequence.
import os
os.listdir('/home/kesci/input/')
['d2lzh1981',
'fraeng6506',
'houseprices2807',
'd2l9528',
'FashionMNIST2065',
'jaychou_lyrics4703',
'd2l_jay9460']
import sys
sys.path.append('/home/kesci/input/d2l9528/')
import collections
import d2l
import zipfile
from d2l.data.base import Vocab
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim
Data preprocessing
Clean the dataset and convert it into minibatches that can be fed to the neural network.
with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
raw_text = f.read()
print(raw_text[0:1000])
def preprocess_raw(text):
text = text.replace('\u202f', ' ').replace('\xa0', ' ')
out = ''
for i, char in enumerate(text.lower()):
if char in (',', '!', '.') and i > 0 and text[i-1] != ' ':
out += ' '
out += char
return out
text = preprocess_raw(raw_text)
print(text[0:1000])
Characters are stored in the computer in encoded form. The ordinary space we use is \x20, which lies within the printable ASCII range 0x20~0x7e. \xa0, on the other hand, belongs to the extended character set of latin1 (ISO/IEC 8859-1) and represents a non-breaking space (nbsp); it falls outside the gbk encoding and is a special character that must be removed. The first step of data preprocessing is therefore to clean the data.
Tokenization
String → list of words.
num_examples = 50000
source, target = [], []
for i, line in enumerate(text.split('\n')):
if i > num_examples:
break
parts = line.split('\t')
if len(parts) >= 2:
source.append(parts[0].split(' '))
target.append(parts[1].split(' '))
source[0:3], target[0:3]
d2l.set_figsize()
d2l.plt.hist([[len(l) for l in source], [len(l) for l in target]],label=['source', 'target'])
d2l.plt.legend(loc='upper right');
Building the vocabulary
List of words → list of word indices.
def build_vocab(tokens):
tokens = [token for line in tokens for token in line]
return d2l.data.base.Vocab(tokens, min_freq=3, use_special_tokens=True)
src_vocab = build_vocab(source)
len(src_vocab)
Loading the dataset
def pad(line, max_len, padding_token):
if len(line) > max_len:
return line[:max_len]
return line + [padding_token] * (max_len - len(line))
pad(src_vocab[source[0]], 10, src_vocab.pad)
def build_array(lines, vocab, max_len, is_source):
lines = [vocab[line] for line in lines]
if not is_source:
lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
    valid_len = (array != vocab.pad).sum(1)  # count non-padding tokens along dim 1
return array, valid_len
def load_data_nmt(batch_size, max_len): # This function is saved in d2l.
src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
return src_vocab, tgt_vocab, train_iter
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)
for X, X_valid_len, Y, Y_valid_len, in train_iter:
print('X =', X.type(torch.int32), '\nValid lengths for X =', X_valid_len,
'\nY =', Y.type(torch.int32), '\nValid lengths for Y =', Y_valid_len)
break
Encoder-Decoder
encoder: maps the input to a hidden state
decoder: maps the hidden state to the output
class Encoder(nn.Module):
def __init__(self, **kwargs):
super(Encoder, self).__init__(**kwargs)
def forward(self, X, *args):
raise NotImplementedError
class Decoder(nn.Module):
def __init__(self, **kwargs):
super(Decoder, self).__init__(**kwargs)
def init_state(self, enc_outputs, *args):
raise NotImplementedError
def forward(self, X, state):
raise NotImplementedError
class EncoderDecoder(nn.Module):
def __init__(self, encoder, decoder, **kwargs):
super(EncoderDecoder, self).__init__(**kwargs)
self.encoder = encoder
self.decoder = decoder
def forward(self, enc_X, dec_X, *args):
enc_outputs = self.encoder(enc_X, *args)
dec_state = self.decoder.init_state(enc_outputs, *args)
return self.decoder(dec_X, dec_state)
The encoder-decoder framework can be applied to dialogue systems and other generative tasks.
Sequence to Sequence model
Model: the figures (omitted here) illustrate the training phase and the prediction phase.
Architecture:
Encoder
class Seq2SeqEncoder(d2l.Encoder):
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqEncoder, self).__init__(**kwargs)
self.num_hiddens=num_hiddens
self.num_layers=num_layers
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.LSTM(embed_size,num_hiddens, num_layers, dropout=dropout)
def begin_state(self, batch_size, device):
return [torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device),
torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device)]
def forward(self, X, *args):
X = self.embedding(X) # X shape: (batch_size, seq_len, embed_size)
X = X.transpose(0, 1) # RNN needs first axes to be time
# state = self.begin_state(X.shape[1], device=X.device)
out, state = self.rnn(X)
# The shape of out is (seq_len, batch_size, num_hiddens).
# state contains the hidden state and the memory cell
# of the last time step, the shape is (num_layers, batch_size, num_hiddens)
return out, state
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8,num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7),dtype=torch.long)
output, state = encoder(X)
output.shape, len(state), state[0].shape, state[1].shape
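For reference: with batch_size=4, seq_len=7, num_hiddens=16 and num_layers=2, the cell above should report (torch.Size([7, 4, 16]), 2, torch.Size([2, 4, 16]), torch.Size([2, 4, 16])). The RNN output is time-major after the transpose, and the state is a tuple holding the hidden state and the memory cell of the last time step.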
Decoder
class Seq2SeqDecoder(d2l.Decoder):
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqDecoder, self).__init__(**kwargs)
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.LSTM(embed_size,num_hiddens, num_layers, dropout=dropout)
self.dense = nn.Linear(num_hiddens,vocab_size)
def init_state(self, enc_outputs, *args):
return enc_outputs[1]
def forward(self, X, state):
X = self.embedding(X).transpose(0, 1)
out, state = self.rnn(X, state)
# Make the batch to be the first dimension to simplify loss computation.
out = self.dense(out).transpose(0, 1)
return out, state
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8,num_hiddens=16, num_layers=2)
state = decoder.init_state(encoder(X))
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, state[1].shape
Loss function
def SequenceMask(X, X_len,value=0):
maxlen = X.size(1)
mask = torch.arange(maxlen)[None, :].to(X_len.device) < X_len[:, None]
X[~mask]=value
return X
X = torch.tensor([[1,2,3], [4,5,6]])
SequenceMask(X,torch.tensor([1,2]))
X = torch.ones((2,3, 4))
SequenceMask(X, torch.tensor([1,2]),value=-1)
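For reference, the first call keeps only the valid prefix of each row and fills the rest with the default value 0, giving [[1, 0, 0], [4, 5, 0]]. In the 3-D case the masked time steps (the last two of the first sample and the last one of the second) are filled with -1 across the whole feature dimension.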
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
# pred shape: (batch_size, seq_len, vocab_size)
# label shape: (batch_size, seq_len)
# valid_length shape: (batch_size, )
def forward(self, pred, label, valid_length):
# the sample weights shape should be (batch_size, seq_len)
weights = torch.ones_like(label)
weights = SequenceMask(weights, valid_length).float()
self.reduction='none'
output=super(MaskedSoftmaxCELoss, self).forward(pred.transpose(1,2), label)
return (output*weights).mean(dim=1)
loss = MaskedSoftmaxCELoss()
loss(torch.ones((3, 4, 10)), torch.ones((3,4),dtype=torch.long), torch.tensor([4,3,0]))
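For reference, every position's cross entropy on uniform logits is -log(1/10) ≈ 2.3026, and the masked loss is averaged over the 4 positions of each sequence, so with valid lengths 4, 3 and 0 the result should be approximately tensor([2.3026, 1.7269, 0.0000]).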
Training
def train_ch7(model, data_iter, lr, num_epochs, device): # Saved in d2l
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss = MaskedSoftmaxCELoss()
tic = time.time()
for epoch in range(1, num_epochs+1):
l_sum, num_tokens_sum = 0.0, 0.0
for batch in data_iter:
optimizer.zero_grad()
X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
Y_input, Y_label, Y_vlen = Y[:,:-1], Y[:,1:], Y_vlen-1
Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
l = loss(Y_hat, Y_label, Y_vlen).sum()
l.backward()
with torch.no_grad():
d2l.grad_clipping_nn(model, 5, device)
num_tokens = Y_vlen.sum().item()
optimizer.step()
l_sum += l.sum().item()
num_tokens_sum += num_tokens
if epoch % 50 == 0:
print("epoch {0:4d},loss {1:.3f}, time {2:.1f} sec".format(
epoch, (l_sum/num_tokens_sum), time.time()-tic))
tic = time.time()
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(
batch_size, max_len,num_examples)
encoder = Seq2SeqEncoder(
len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(
len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)
Testing
def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
src_tokens = src_vocab[src_sentence.lower().split(' ')]
src_len = len(src_tokens)
if src_len < max_len:
src_tokens += [src_vocab.pad] * (max_len - src_len)
enc_X = torch.tensor(src_tokens, device=device)
enc_valid_length = torch.tensor([src_len], device=device)
    # use unsqueeze to add the batch_size dimension.
enc_outputs = model.encoder(enc_X.unsqueeze(dim=0), enc_valid_length)
dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
dec_X = torch.tensor([tgt_vocab.bos], device=device).unsqueeze(dim=0)
predict_tokens = []
for _ in range(max_len):
Y, dec_state = model.decoder(dec_X, dec_state)
# The token with highest score is used as the next time step input.
dec_X = Y.argmax(dim=2)
py = dec_X.squeeze(dim=0).int().item()
if py == tgt_vocab.eos:
break
predict_tokens.append(py)
return ' '.join(tgt_vocab.to_tokens(predict_tokens))
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
print(sentence + ' => ' + translate_ch7(
model, sentence, src_vocab, tgt_vocab, max_len, ctx))
Beam Search
Simple greedy search: at every step pick the single highest-scoring token.
Viterbi-style exhaustive search: pick the sentence with the highest overall score (the search space is far too large).
Beam search: at every step keep only the top-k highest-scoring partial candidates.
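To make the difference concrete, below is a minimal sketch of beam search over per-step log-probabilities (without length normalization). The step_log_probs function is a hypothetical stand-in for one decoder step and is not part of this notebook's model code; greedy search is the special case beam_size=1, and the exhaustive search is intractable because the number of candidate sentences grows exponentially with length.
def beam_search(step_log_probs, bos, eos, beam_size=3, max_len=10):
    # Each hypothesis is a (token list, accumulated log-probability) pair.
    beams = [([bos], 0.0)]
    for _ in range(max_len):
        candidates = []
        for tokens, score in beams:
            if tokens[-1] == eos:
                candidates.append((tokens, score))  # finished hypotheses are kept as they are
                continue
            log_p = step_log_probs(tokens)          # hypothetical: returns a (vocab_size,) tensor of log-probs
            top_p, top_idx = log_p.topk(beam_size)
            for p, idx in zip(top_p.tolist(), top_idx.tolist()):
                candidates.append((tokens + [idx], score + p))
        # keep only the beam_size best partial translations
        beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]
    return beams[0][0]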
print("END")
Attention Mechanism
In the encoder-decoder (seq2seq) section, the decoder relies on the same context vector at every time step to obtain information about the input sequence. When the encoder is a recurrent neural network, the context vector comes from the hidden state of its final time step: the source sequence is encoded into the recurrent unit's state and then passed to the decoder to generate the target sequence. This design has a problem. In practice RNNs suffer from vanishing gradients over long ranges, so for long sentences we can hardly expect a fixed-length vector to preserve all of the useful information in the input, and the performance of this architecture degrades noticeably as the sentences to be translated grow longer.
Moreover, a decoded target word may depend on only part of the input rather than on all of it. For example, when translating "Hello world" into "Bonjour le monde", "Hello" maps to "Bonjour" and "world" maps to "monde". In the seq2seq model the decoder can only select the relevant information implicitly from the encoder's final state; the attention mechanism models this selection explicitly.
The attention framework
Attention is a generalized weighted-pooling method whose input has two parts: a query and a set of key-value pairs. Suppose the keys and values are $\mathbf{k}_i \in \mathbb{R}^{d_k}$ and $\mathbf{v}_i \in \mathbb{R}^{d_v}$ for $i = 1, \ldots, n$, and the query is $\mathbf{q} \in \mathbb{R}^{d_q}$. The attention layer returns an output $\mathbf{o} \in \mathbb{R}^{d_v}$ with the same dimension as the values. For a given query, the attention layer computes a score against every key and normalizes the scores into weights; the output is the weighted sum of the values, with one weight per key.
To compute the output, we first assume a score function $\alpha$ that measures the similarity between the query and a key, and compute all attention scores $a_1, \ldots, a_n$ by
$$a_i = \alpha(\mathbf{q}, \mathbf{k}_i).$$
A softmax turns the scores into attention weights:
$$b_1, \ldots, b_n = \mathrm{softmax}(a_1, \ldots, a_n).$$
The final output is the weighted sum of the values:
$$\mathbf{o} = \sum_{i=1}^{n} b_i \mathbf{v}_i.$$
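As a small worked example of these three steps (using a dot product as the score function $\alpha$ purely for illustration; the numbers are made up):
import torch
q = torch.tensor([1.0, 0.0])                            # one query
K = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])  # three keys
V = torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # three values
a = K @ q                                 # scores a_i = alpha(q, k_i)
b = torch.softmax(a, dim=0)               # attention weights b_1, ..., b_n
o = (b.unsqueeze(-1) * V).sum(dim=0)      # output: weighted sum of the values
print(a, b, o)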
Different attention layers differ in their choice of score function. In the rest of this section we discuss two commonly used attention layers, dot-product attention and multilayer perceptron (MLP) attention; we then implement a seq2seq model with attention and train and test it on the English-French translation corpus.
import math
import torch
import torch.nn as nn
import os
def file_name_walk(file_dir):
for root, dirs, files in os.walk(file_dir):
# print("root", root) # 當(dāng)前目錄路徑
print("dirs", dirs) # 當(dāng)前路徑下所有子目錄
print("files", files) # 當(dāng)前路徑下所有非目錄子文件
file_name_walk("/home/kesci/input/fraeng6506")
dirs []
files ['_about.txt', 'fra.txt']
Masked softmax
Before diving into the implementation, we first introduce a masking operation for the softmax operator.
def SequenceMask(X, X_len,value=-1e6):
maxlen = X.size(1)
#print(X.size(),torch.arange((maxlen),dtype=torch.float)[None, :],'\n',X_len[:, None] )
mask = torch.arange((maxlen),dtype=torch.float)[None, :] >= X_len[:, None]
#print(mask)
X[mask]=value
return X
def masked_softmax(X, valid_length):
# X: 3-D tensor, valid_length: 1-D or 2-D tensor
softmax = nn.Softmax(dim=-1)
if valid_length is None:
return softmax(X)
else:
shape = X.shape
if valid_length.dim() == 1:
try:
valid_length = torch.FloatTensor(valid_length.numpy().repeat(shape[1], axis=0))#[2,2,3,3]
except:
valid_length = torch.FloatTensor(valid_length.cpu().numpy().repeat(shape[1], axis=0))#[2,2,3,3]
else:
valid_length = valid_length.reshape((-1,))
# fill masked elements with a large negative, whose exp is 0
X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
return softmax(X).reshape(shape)
masked_softmax(torch.rand((2,2,4),dtype=torch.float), torch.FloatTensor([2,3]))
tensor([[[0.4055, 0.5945, 0.0000, 0.0000],
[0.5396, 0.4604, 0.0000, 0.0000]],
[[0.3266, 0.3686, 0.3048, 0.0000],
[0.3770, 0.3882, 0.2348, 0.0000]]])
Matrix multiplication beyond 2-D
$X$ and $Y$ are tensors of shape $(b, n, m)$ and $(b, m, k)$ respectively. Performing $b$ independent 2-D matrix multiplications yields $Z$ of shape $(b, n, k)$:
$$Z[i, :, :] = X[i, :, :] \, Y[i, :, :], \qquad i = 1, \ldots, b.$$
torch.bmm(torch.ones((2,1,3), dtype = torch.float), torch.ones((2,3,2), dtype = torch.float))
tensor([[[3., 3.]],
[[3., 3.]]])
Dot-product attention
Dot-product attention assumes that the query and the keys have the same dimension, i.e. $\mathbf{q}, \mathbf{k}_i \in \mathbb{R}^{d}$. The attention score is the product of the query and the transposed key, usually divided by $\sqrt{d}$ to reduce its dependence on the dimension $d$:
$$\alpha(\mathbf{q}, \mathbf{k}) = \langle \mathbf{q}, \mathbf{k} \rangle / \sqrt{d}.$$
Assume $\mathbf{Q} \in \mathbb{R}^{m \times d}$ contains $m$ queries and $\mathbf{K} \in \mathbb{R}^{n \times d}$ contains $n$ keys. All $mn$ scores can then be computed with a single matrix product:
$$\alpha(\mathbf{Q}, \mathbf{K}) = \mathbf{Q}\mathbf{K}^\top / \sqrt{d}.$$
Let us now implement this layer. It supports a batch of queries and key-value pairs, and it supports randomly dropping some attention weights (dropout) as regularization.
# Save to the d2l package.
class DotProductAttention(nn.Module):
def __init__(self, dropout, **kwargs):
super(DotProductAttention, self).__init__(**kwargs)
self.dropout = nn.Dropout(dropout)
# query: (batch_size, #queries, d)
# key: (batch_size, #kv_pairs, d)
# value: (batch_size, #kv_pairs, dim_v)
# valid_length: either (batch_size, ) or (batch_size, xx)
def forward(self, query, key, value, valid_length=None):
d = query.shape[-1]
        # swap the last two dimensions of key with transpose(1, 2)
scores = torch.bmm(query, key.transpose(1,2)) / math.sqrt(d)
attention_weights = self.dropout(masked_softmax(scores, valid_length))
print("attention_weight\n",attention_weights)
return torch.bmm(attention_weights, value)
Testing
We now create two batches, each with one query and 10 key-value pairs. Through valid_length we specify that we attend only to the first 2 key-value pairs in the first batch and to the first 6 in the second. Therefore, even though the two batches have the same query and key-value pairs, the outputs differ.
atten = DotProductAttention(dropout=0)
keys = torch.ones((2,10,2),dtype=torch.float)
values = torch.arange((40), dtype=torch.float).view(1,10,4).repeat(2,1,1)
atten(torch.ones((2,1,2),dtype=torch.float), keys, values, torch.FloatTensor([2, 6]))
attention_weight
tensor([[[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
0.0000, 0.0000]],
[[0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000,
0.0000, 0.0000]]])
tensor([[[ 2.0000, 3.0000, 4.0000, 5.0000]],
[[10.0000, 11.0000, 12.0000, 13.0000]]])
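Because all ten keys are identical, every score is the same, and the masked softmax simply spreads the weight uniformly over the valid positions: 1/2 over the first two key-value pairs in the first batch and 1/6 over the first six in the second. The outputs are therefore the averages of the first two and of the first six value rows, which is why they equal [2, 3, 4, 5] and [10, 11, 12, 13].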
Multilayer perceptron attention
In MLP attention we first project the query and the keys to $\mathbb{R}^{h}$. More concretely, we learn the parameters $\mathbf{W}_k \in \mathbb{R}^{h \times d_k}$, $\mathbf{W}_q \in \mathbb{R}^{h \times d_q}$ and $\mathbf{v} \in \mathbb{R}^{h}$, and define the score function as
$$\alpha(\mathbf{k}, \mathbf{q}) = \mathbf{v}^\top \tanh(\mathbf{W}_k \mathbf{k} + \mathbf{W}_q \mathbf{q}).$$
Intuitively, this concatenates the key and the query along the feature dimension and feeds them to a single-hidden-layer perceptron whose hidden size is $h$ and whose output size is 1; the hidden activation is tanh and there is no bias.
# Save to the d2l package.
class MLPAttention(nn.Module):
def __init__(self, units,ipt_dim,dropout, **kwargs):
super(MLPAttention, self).__init__(**kwargs)
        # nn.Linear acts on the last dimension, so the 3-D shapes of query and key are kept
self.W_k = nn.Linear(ipt_dim, units, bias=False)
self.W_q = nn.Linear(ipt_dim, units, bias=False)
self.v = nn.Linear(units, 1, bias=False)
self.dropout = nn.Dropout(dropout)
def forward(self, query, key, value, valid_length):
        query, key = self.W_q(query), self.W_k(key)
#print("size",query.size(),key.size())
# expand query to (batch_size, #querys, 1, units), and key to
# (batch_size, 1, #kv_pairs, units). Then plus them with broadcast.
features = query.unsqueeze(2) + key.unsqueeze(1)
#print("features:",features.size()) #--------------開啟
scores = self.v(features).squeeze(-1)
attention_weights = self.dropout(masked_softmax(scores, valid_length))
return torch.bmm(attention_weights, value)
Testing
Although MLPAttention contains an extra MLP model, given the same inputs with identical keys we obtain the same output as with DotProductAttention.
atten = MLPAttention(ipt_dim=2,units = 8, dropout=0)
atten(torch.ones((2,1,2), dtype = torch.float), keys, values, torch.FloatTensor([2, 6]))
tensor([[[ 2.0000, 3.0000, 4.0000, 5.0000]],
[[10.0000, 11.0000, 12.0000, 13.0000]]], grad_fn=<BmmBackward>)
Summary
- The attention layer explicitly selects related information.
- The attention layer's memory consists of key-value pairs, so its output is close to the values whose keys are similar to the query.
Seq2seq model with attention
In this section we add the attention mechanism to the sequence to sequence model so that the encoder states are aggregated with explicit weights. The figure (omitted here) shows the encoding and decoding structure at decoding time step t. The attention layer holds everything the encoder has seen, namely the encoder output at every time step. During decoding, the decoder hidden state from the previous time step is used as the query, and the encoder hidden states at every time step serve as both keys and values for the attention aggregation. The attention output is treated as the context vector, concatenated with the decoder input, and fed into the decoder.
The figure (omitted here) shows how all layers of the attention-based seq2seq model relate to each other, along with the layer structure of the encoder and the decoder.
import sys
sys.path.append('/home/kesci/input/d2len9900')
import d2l
Decoder
Since the encoder of the attention-based seq2seq model is the same Seq2SeqEncoder as in the previous section, we only focus on the decoder here. We add an MLP attention layer (MLPAttention) whose hidden size equals that of the LSTM layer in the decoder. The decoder state is then initialized from three quantities passed on from the encoder:
- the encoder outputs of all time steps: they serve as the attention layer's memory, with identical keys and values;
- the hidden state of the encoder's final time step: it is used to initialize the decoder's hidden state;
- the encoder valid lengths: with these, the attention layer will not consider the padding tokens in the encoder outputs.
At each decoding time step we use the output of the decoder's last RNN layer as the attention query. The attention output is then concatenated with the input embedding vector and fed into the RNN layer. Although the RNN hidden state also carries historical information from the decoder, the attention output explicitly selects among the encoder outputs within enc_valid_len, so the attention mechanism excludes as much irrelevant information as possible.
class Seq2SeqAttentionDecoder(d2l.Decoder):
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
self.attention_cell = MLPAttention(num_hiddens,num_hiddens, dropout)
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.LSTM(embed_size+ num_hiddens,num_hiddens, num_layers, dropout=dropout)
self.dense = nn.Linear(num_hiddens,vocab_size)
def init_state(self, enc_outputs, enc_valid_len, *args):
outputs, hidden_state = enc_outputs
# print("first:",outputs.size(),hidden_state[0].size(),hidden_state[1].size())
# Transpose outputs to (batch_size, seq_len, hidden_size)
return (outputs.permute(1,0,-1), hidden_state, enc_valid_len)
#outputs.swapaxes(0, 1)
def forward(self, X, state):
enc_outputs, hidden_state, enc_valid_len = state
#("X.size",X.size())
X = self.embedding(X).transpose(0,1)
# print("Xembeding.size2",X.size())
outputs = []
for l, x in enumerate(X):
# print(f"\n{l}-th token")
# print("x.first.size()",x.size())
# query shape: (batch_size, 1, hidden_size)
# select hidden state of the last rnn layer as query
query = hidden_state[0][-1].unsqueeze(1) # np.expand_dims(hidden_state[0][-1], axis=1)
# context has same shape as query
# print("query enc_outputs, enc_outputs:\n",query.size(), enc_outputs.size(), enc_outputs.size())
context = self.attention_cell(query, enc_outputs, enc_outputs, enc_valid_len)
# Concatenate on the feature dimension
# print("context.size:",context.size())
x = torch.cat((context, x.unsqueeze(1)), dim=-1)
# Reshape x to (1, batch_size, embed_size+hidden_size)
# print("rnn",x.size(), len(hidden_state))
out, hidden_state = self.rnn(x.transpose(0,1), hidden_state)
outputs.append(out)
outputs = self.dense(torch.cat(outputs, dim=0))
return outputs.transpose(0, 1), [enc_outputs, hidden_state,
enc_valid_len]
We can now test the seq2seq model with attention. To stay consistent with the model in Section 9.7 we use the same hyperparameters for vocab_size, embed_size, num_hiddens and num_layers. As a result we obtain the same decoder output shape, but the state structure changes.
encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8,
num_hiddens=16, num_layers=2)
# encoder.initialize()
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8,
num_hiddens=16, num_layers=2)
X = torch.zeros((4, 7),dtype=torch.long)
print("batch size=4\nseq_length=7\nhidden dim=16\nnum_layers=2\n")
print('encoder output size:', encoder(X)[0].size())
print('encoder hidden size:', encoder(X)[1][0].size())
print('encoder memory size:', encoder(X)[1][1].size())
state = decoder.init_state(encoder(X), None)
out, state = decoder(X, state)
out.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
Training
As in Section 9.7.4, we try a simple toy model with the same training hyperparameters and the same training loss. As the results show, since the sequences in the training dataset are relatively short, the extra attention layer does not bring a significant improvement; due to the computational overhead of the attention layer, this model is also considerably slower than the seq2seq model without attention.
import zipfile
import torch
import requests
from io import BytesIO
from torch.utils import data
import sys
import collections
class Vocab(object): # This class is saved in d2l.
def __init__(self, tokens, min_freq=0, use_special_tokens=False):
# sort by frequency and token
counter = collections.Counter(tokens)
token_freqs = sorted(counter.items(), key=lambda x: x[0])
token_freqs.sort(key=lambda x: x[1], reverse=True)
if use_special_tokens:
# padding, begin of sentence, end of sentence, unknown
self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            tokens = ['<pad>', '<bos>', '<eos>', '<unk>']
else:
self.unk = 0
            tokens = ['<unk>']
tokens += [token for token, freq in token_freqs if freq >= min_freq]
self.idx_to_token = []
self.token_to_idx = dict()
for token in tokens:
self.idx_to_token.append(token)
self.token_to_idx[token] = len(self.idx_to_token) - 1
def __len__(self):
return len(self.idx_to_token)
def __getitem__(self, tokens):
if not isinstance(tokens, (list, tuple)):
return self.token_to_idx.get(tokens, self.unk)
else:
return [self.__getitem__(token) for token in tokens]
def to_tokens(self, indices):
if not isinstance(indices, (list, tuple)):
return self.idx_to_token[indices]
else:
return [self.idx_to_token[index] for index in indices]
def load_data_nmt(batch_size, max_len, num_examples=1000):
"""Download an NMT dataset, return its vocabulary and data iterator."""
# Download and preprocess
def preprocess_raw(text):
text = text.replace('\u202f', ' ').replace('\xa0', ' ')
out = ''
for i, char in enumerate(text.lower()):
if char in (',', '!', '.') and text[i-1] != ' ':
out += ' '
out += char
return out
with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
raw_text = f.read()
text = preprocess_raw(raw_text)
# Tokenize
source, target = [], []
for i, line in enumerate(text.split('\n')):
if i >= num_examples:
break
parts = line.split('\t')
if len(parts) >= 2:
source.append(parts[0].split(' '))
target.append(parts[1].split(' '))
# Build vocab
def build_vocab(tokens):
tokens = [token for line in tokens for token in line]
return Vocab(tokens, min_freq=3, use_special_tokens=True)
src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
# Convert to index arrays
def pad(line, max_len, padding_token):
if len(line) > max_len:
return line[:max_len]
return line + [padding_token] * (max_len - len(line))
def build_array(lines, vocab, max_len, is_source):
lines = [vocab[line] for line in lines]
if not is_source:
lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
valid_len = (array != vocab.pad).sum(1)
return array, valid_len
src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
return src_vocab, tgt_vocab, train_iter
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_steps = 64, 10
lr, num_epochs, ctx = 0.005, 500, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size, num_steps)
encoder = d2l.Seq2SeqEncoder(
len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(
len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
Training and prediction
d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, ctx)
for sentence in ['Go .', 'Good Night !', "I'm OK .", 'I won !']:
print(sentence + ' => ' + d2l.predict_s2s_ch9(
model, sentence, src_vocab, tgt_vocab, num_steps, ctx))
Transformer
In previous sections we introduced the mainstream neural network architectures, convolutional neural networks (CNNs) and recurrent neural networks (RNNs). To recap:
- CNNs are easy to parallelize but not well suited to capturing dependencies within variable-length sequences.
- RNNs capture long-range dependencies in variable-length sequences well, but are hard to parallelize across the sequence.
To combine the advantages of CNNs and RNNs, [Vaswani et al., 2017] designed the Transformer model, built on attention mechanisms. It captures sequence dependencies with attention while processing the tokens at every position of the sequence in parallel, which makes the Transformer both strong in performance and much faster to train.
Figure 10.3.1 shows the Transformer architecture. Like the seq2seq model of Section 9.7, the Transformer is based on an encoder-decoder architecture; it differs mainly in three respects:
- Transformer blocks: the recurrent networks in the seq2seq model are replaced by Transformer blocks, each containing a multi-head attention layer and a position-wise feed-forward network (FFN). In the decoder, an additional multi-head attention layer attends to the encoder's hidden states.
- Add and norm: the outputs of the multi-head attention layer and the FFN are processed by "add and norm" layers, which combine a residual connection with layer normalization.
- Position encoding: since the self-attention layer does not distinguish the order of elements, a positional encoding layer is used to inject position information into the sequence elements.
In the remaining parts we will implement each of these new building blocks of the Transformer and put them together into a neural machine translation model for training and testing.
import os
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
sys.path.append('/home/kesci/input/d2len9900')
import d2l
The following reproduces the masked softmax implementation from the previous subsection, so we will not discuss it again.
def SequenceMask(X, X_len,value=-1e6):
maxlen = X.size(1)
X_len = X_len.to(X.device)
#print(X.size(),torch.arange((maxlen),dtype=torch.float)[None, :],'\n',X_len[:, None] )
mask = torch.arange((maxlen), dtype=torch.float, device=X.device)
mask = mask[None, :] < X_len[:, None]
#print(mask)
X[~mask]=value
return X
def masked_softmax(X, valid_length):
# X: 3-D tensor, valid_length: 1-D or 2-D tensor
softmax = nn.Softmax(dim=-1)
if valid_length is None:
return softmax(X)
else:
shape = X.shape
if valid_length.dim() == 1:
try:
valid_length = torch.FloatTensor(valid_length.numpy().repeat(shape[1], axis=0))#[2,2,3,3]
except:
valid_length = torch.FloatTensor(valid_length.cpu().numpy().repeat(shape[1], axis=0))#[2,2,3,3]
else:
valid_length = valid_length.reshape((-1,))
# fill masked elements with a large negative, whose exp is 0
X = SequenceMask(X.reshape((-1, shape[-1])), valid_length)
return softmax(X).reshape(shape)
# Save to the d2l package.
class DotProductAttention(nn.Module):
def __init__(self, dropout, **kwargs):
super(DotProductAttention, self).__init__(**kwargs)
self.dropout = nn.Dropout(dropout)
# query: (batch_size, #queries, d)
# key: (batch_size, #kv_pairs, d)
# value: (batch_size, #kv_pairs, dim_v)
# valid_length: either (batch_size, ) or (batch_size, xx)
def forward(self, query, key, value, valid_length=None):
d = query.shape[-1]
        # swap the last two dimensions of key with transpose(1, 2)
scores = torch.bmm(query, key.transpose(1,2)) / math.sqrt(d)
attention_weights = self.dropout(masked_softmax(scores, valid_length))
return torch.bmm(attention_weights, value)
Multi-head attention
Before discussing the multi-head attention layer, let us quickly review the structure of self-attention. Self-attention is an ordinary attention model in which the key, value and query of every element of the sequence are exactly the same. As illustrated in Figure 10.3.2, self-attention outputs a representation sequence of the same length as the input; compared with a recurrent network, the output for every element can be computed in parallel, so the module can be implemented very efficiently.
The multi-head attention layer consists of $h$ parallel self-attention layers, each of which is called a head. For each head, before computing attention we map the query, key and value with three linear layers; the outputs of the $h$ attention heads are then concatenated and passed through a final linear layer.
Assume the dimensions of the query, key and value are $d_q$, $d_k$ and $d_v$ respectively. For each head $i = 1, \ldots, h$ we can learn the weights $\mathbf{W}_q^{(i)} \in \mathbb{R}^{p_q \times d_q}$, $\mathbf{W}_k^{(i)} \in \mathbb{R}^{p_k \times d_k}$ and $\mathbf{W}_v^{(i)} \in \mathbb{R}^{p_v \times d_v}$, so that the output of head $i$ is
$$\mathbf{o}^{(i)} = \mathrm{attention}(\mathbf{W}_q^{(i)} \mathbf{q}, \mathbf{W}_k^{(i)} \mathbf{k}, \mathbf{W}_v^{(i)} \mathbf{v}).$$
Here the attention can be any attention function, e.g. the dot-product attention or the MLP attention introduced in the previous section. We then concatenate the outputs of all heads and feed them into the final linear layer, whose weights can be written as $\mathbf{W}_o \in \mathbb{R}^{d_o \times h p_v}$:
$$\mathbf{o} = \mathbf{W}_o \, [\mathbf{o}^{(1)}; \ldots; \mathbf{o}^{(h)}].$$
Now we can implement multi-head attention. Assume we have $h$ heads and that the hidden size $p_q = p_k = p_v = \text{hidden\_size}$ matches the dimensions of the query, key and value. In addition, since the multi-head attention layer keeps the input and output tensor dimensions unchanged, the output feature size is also set to $d_o = \text{hidden\_size}$.
class MultiHeadAttention(nn.Module):
def __init__(self, input_size, hidden_size, num_heads, dropout, **kwargs):
super(MultiHeadAttention, self).__init__(**kwargs)
self.num_heads = num_heads
self.attention = DotProductAttention(dropout)
self.W_q = nn.Linear(input_size, hidden_size, bias=False)
self.W_k = nn.Linear(input_size, hidden_size, bias=False)
self.W_v = nn.Linear(input_size, hidden_size, bias=False)
self.W_o = nn.Linear(hidden_size, hidden_size, bias=False)
def forward(self, query, key, value, valid_length):
# query, key, and value shape: (batch_size, seq_len, dim),
# where seq_len is the length of input sequence
# valid_length shape is either (batch_size, )
# or (batch_size, seq_len).
# Project and transpose query, key, and value from
# (batch_size, seq_len, hidden_size * num_heads) to
# (batch_size * num_heads, seq_len, hidden_size).
query = transpose_qkv(self.W_q(query), self.num_heads)
key = transpose_qkv(self.W_k(key), self.num_heads)
value = transpose_qkv(self.W_v(value), self.num_heads)
if valid_length is not None:
# Copy valid_length by num_heads times
device = valid_length.device
valid_length = valid_length.cpu().numpy() if valid_length.is_cuda else valid_length.numpy()
if valid_length.ndim == 1:
valid_length = torch.FloatTensor(np.tile(valid_length, self.num_heads))
else:
valid_length = torch.FloatTensor(np.tile(valid_length, (self.num_heads,1)))
valid_length = valid_length.to(device)
output = self.attention(query, key, value, valid_length)
output_concat = transpose_output(output, self.num_heads)
return self.W_o(output_concat)
def transpose_qkv(X, num_heads):
# Original X shape: (batch_size, seq_len, hidden_size * num_heads),
# -1 means inferring its value, after first reshape, X shape:
# (batch_size, seq_len, num_heads, hidden_size)
X = X.view(X.shape[0], X.shape[1], num_heads, -1)
# After transpose, X shape: (batch_size, num_heads, seq_len, hidden_size)
X = X.transpose(2, 1).contiguous()
# Merge the first two dimensions. Use reverse=True to infer shape from
# right to left.
# output shape: (batch_size * num_heads, seq_len, hidden_size)
output = X.view(-1, X.shape[2], X.shape[3])
return output
# Saved in the d2l package for later use
def transpose_output(X, num_heads):
# A reversed version of transpose_qkv
X = X.view(-1, num_heads, X.shape[1], X.shape[2])
X = X.transpose(2, 1).contiguous()
return X.view(X.shape[0], X.shape[1], -1)
cell = MultiHeadAttention(5, 9, 3, 0.5)
X = torch.ones((2, 4, 5))
valid_length = torch.FloatTensor([2, 3])
cell(X, X, X, valid_length).shape
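For reference, hidden_size=9 here is split across num_heads=3 heads, so each head attends in a 3-dimensional subspace; after the final W_o projection the output keeps the shape (batch_size, seq_len, hidden_size), i.e. torch.Size([2, 4, 9]).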
基于位置的前饋網(wǎng)絡(luò)
Transformer 模塊另一個非常重要的部分就是基于位置的前饋網(wǎng)絡(luò)(FFN),它接受一個形狀為(batch_size,seq_length, feature_size)的三維張量。Position-wise FFN由兩個全連接層組成,他們作用在最后一維上。因為序列的每個位置的狀態(tài)都會被單獨地更新,所以我們稱他為position-wise,這等效于一個1x1的卷積。
下面我們來實現(xiàn)PositionWiseFFN:
# Save to the d2l package.
class PositionWiseFFN(nn.Module):
def __init__(self, input_size, ffn_hidden_size, hidden_size_out, **kwargs):
super(PositionWiseFFN, self).__init__(**kwargs)
self.ffn_1 = nn.Linear(input_size, ffn_hidden_size)
self.ffn_2 = nn.Linear(ffn_hidden_size, hidden_size_out)
def forward(self, X):
return self.ffn_2(F.relu(self.ffn_1(X)))
Like the multi-head attention layer, the FFN changes only the size of the last dimension; moreover, two identical inputs produce identical FFN outputs.
ffn = PositionWiseFFN(4, 4, 8)
out = ffn(torch.ones((2,3,4)))
print(out, out.shape)
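To make the "1x1 convolution" remark above concrete, here is a small sketch (not part of the original notebook) that copies the weights of a position-wise nn.Linear into an nn.Conv1d with kernel size 1 and checks that the two produce the same output:
import torch
import torch.nn as nn
linear = nn.Linear(4, 8)
conv = nn.Conv1d(4, 8, kernel_size=1)
with torch.no_grad():
    conv.weight.copy_(linear.weight.unsqueeze(-1))   # (8, 4) -> (8, 4, 1)
    conv.bias.copy_(linear.bias)
x = torch.randn(2, 3, 4)                              # (batch, seq_len, features)
y_linear = linear(x)                                  # applied independently at every position
y_conv = conv(x.transpose(1, 2)).transpose(1, 2)      # Conv1d expects (batch, channels, seq_len)
print(torch.allclose(y_linear, y_conv, atol=1e-6))    # expected: True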
Add and Norm
Besides the two modules above, the Transformer has another important add-and-normalize layer, which smoothly combines a layer's input and output. Concretely, we add a layer-norm layer with a residual connection after every multi-head attention layer and every FFN layer. Layer Norm here is very similar to the Batch Norm of Section 7.5; the only difference is that Batch Norm computes the mean and variance over the batch dimension, whereas Layer Norm does so over the last dimension. Layer normalization keeps the values within a layer from drifting too much, which speeds up training and improves generalization.
layernorm = nn.LayerNorm(normalized_shape=2, elementwise_affine=True)
batchnorm = nn.BatchNorm1d(num_features=2, affine=True)
X = torch.FloatTensor([[1,2], [3,4]])
print('layer norm:', layernorm(X))
print('batch norm:', batchnorm(X))
# Save to the d2l package.
class AddNorm(nn.Module):
def __init__(self, hidden_size, dropout, **kwargs):
super(AddNorm, self).__init__(**kwargs)
self.dropout = nn.Dropout(dropout)
self.norm = nn.LayerNorm(hidden_size)
def forward(self, X, Y):
return self.norm(self.dropout(Y) + X)
Because of the residual connection, X and Y must have the same shape.
add_norm = AddNorm(4, 0.5)
add_norm(torch.ones((2,3,4)), torch.ones((2,3,4))).shape
Positional encoding
Unlike recurrent networks, both the multi-head attention layer and the feed-forward network update every position independently. This property enables efficient parallelization but loses the important ordering information of the sequence. To better capture sequence information, the Transformer introduces positional encodings to preserve the positions of the input elements.
Assume the input embedding is $X \in \mathbb{R}^{l \times d}$, where $l$ is the sequence length and $d$ the embedding dimension. The positional encoding is $P \in \mathbb{R}^{l \times d}$, and the output is the sum of the two, $X + P$.
The positional encoding is a 2-D matrix, where $i$ indexes the position in the sequence and $j$ indexes the dimension inside the embedding vector. It is computed as
$$P_{i, 2j} = \sin\!\left(i / 10000^{2j/d}\right), \qquad P_{i, 2j+1} = \cos\!\left(i / 10000^{2j/d}\right)$$
for $i = 0, \ldots, l-1$ and $j = 0, \ldots, \lfloor (d-1)/2 \rfloor$.
class PositionalEncoding(nn.Module):
def __init__(self, embedding_size, dropout, max_len=1000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(dropout)
self.P = np.zeros((1, max_len, embedding_size))
X = np.arange(0, max_len).reshape(-1, 1) / np.power(
10000, np.arange(0, embedding_size, 2)/embedding_size)
self.P[:, :, 0::2] = np.sin(X)
self.P[:, :, 1::2] = np.cos(X)
self.P = torch.FloatTensor(self.P)
def forward(self, X):
if X.is_cuda and not self.P.is_cuda:
self.P = self.P.cuda()
X = X + self.P[:, :X.shape[1], :]
return self.dropout(X)
Testing
Let us run a small test with the PositionalEncoding class and visualize four of its dimensions. Dimensions 4 and 5 share the same frequency but different phases, while dimensions 6 and 7 have a lower frequency; thus the positional encoding is distinguishable across dimensions.
import numpy as np
pe = PositionalEncoding(20, 0)
Y = pe(torch.zeros((1, 100, 20))).numpy()
d2l.plot(np.arange(100), Y[0, :, 4:8].T, figsize=(6, 2.5),
legend=["dim %d" % p for p in [4, 5, 6, 7]])
Encoder
We now have all the building blocks of the Transformer, so let us assemble them. The encoder block contains a multi-head attention layer, a position-wise FFN and two add-and-norm layers. For both the attention model and the FFN the output dimension equals the embedding dimension; this is forced by the residual connections, since the previous layer's output must be added to the original input before normalization.
class EncoderBlock(nn.Module):
def __init__(self, embedding_size, ffn_hidden_size, num_heads,
dropout, **kwargs):
super(EncoderBlock, self).__init__(**kwargs)
self.attention = MultiHeadAttention(embedding_size, embedding_size, num_heads, dropout)
self.addnorm_1 = AddNorm(embedding_size, dropout)
self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size, embedding_size)
self.addnorm_2 = AddNorm(embedding_size, dropout)
def forward(self, X, valid_length):
Y = self.addnorm_1(X, self.attention(X, X, X, valid_length))
return self.addnorm_2(Y, self.ffn(Y))
# batch_size = 2, seq_len = 100, embedding_size = 24
# ffn_hidden_size = 48, num_head = 8, dropout = 0.5
X = torch.ones((2, 100, 24))
encoder_blk = EncoderBlock(24, 48, 8, 0.5)
encoder_blk(X, valid_length).shape
Now we can implement the whole Transformer encoder, which stacks $n$ of the encoder blocks just defined. Because of the residual connections, the dimension of the intermediate states always equals the embedding dimension $d$; note also that we multiply the embedding by $\sqrt{d}$ to keep its values from being too small.
class TransformerEncoder(d2l.Encoder):
def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
num_heads, num_layers, dropout, **kwargs):
super(TransformerEncoder, self).__init__(**kwargs)
self.embedding_size = embedding_size
self.embed = nn.Embedding(vocab_size, embedding_size)
self.pos_encoding = PositionalEncoding(embedding_size, dropout)
self.blks = nn.ModuleList()
for i in range(num_layers):
self.blks.append(
EncoderBlock(embedding_size, ffn_hidden_size,
num_heads, dropout))
def forward(self, X, valid_length, *args):
X = self.pos_encoding(self.embed(X) * math.sqrt(self.embedding_size))
for blk in self.blks:
X = blk(X, valid_length)
return X
# test encoder
encoder = TransformerEncoder(200, 24, 48, 8, 2, 0.5)
encoder(torch.ones((2, 100)).long(), valid_length).shape
Decoder
The Transformer decoder is similar in structure to the encoder, except that each decoder block contains one additional sub-layer beyond the modules introduced above. This extra sub-layer is also a multi-head attention layer; it takes the encoder outputs as keys and values and the decoder state as queries. As in the encoder, the decoder uses the add-and-norm mechanism, connecting the output of each sub-layer through a residual connection and layer normalization.
More precisely, at time step t the current input is the query, and the self-attention attends to the inputs of step t and of all previous t-1 steps. During training, however, the input at position t can observe the whole sequence, which contradicts the situation at prediction time; we therefore set the valid length of the t-th time step to t, so that the attention cannot see future information it should not use.
class DecoderBlock(nn.Module):
def __init__(self, embedding_size, ffn_hidden_size, num_heads,dropout,i,**kwargs):
super(DecoderBlock, self).__init__(**kwargs)
self.i = i
self.attention_1 = MultiHeadAttention(embedding_size, embedding_size, num_heads, dropout)
self.addnorm_1 = AddNorm(embedding_size, dropout)
self.attention_2 = MultiHeadAttention(embedding_size, embedding_size, num_heads, dropout)
self.addnorm_2 = AddNorm(embedding_size, dropout)
self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size, embedding_size)
self.addnorm_3 = AddNorm(embedding_size, dropout)
def forward(self, X, state):
enc_outputs, enc_valid_length = state[0], state[1]
# state[2][self.i] stores all the previous t-1 query state of layer-i
# len(state[2]) = num_layers
# If training:
# state[2] is useless.
# If predicting:
# In the t-th timestep:
# state[2][self.i].shape = (batch_size, t-1, hidden_size)
# Demo:
# love dogs ! [EOS]
# | | | |
# Transformer
# Decoder
# | | | |
# I love dogs !
if state[2][self.i] is None:
key_values = X
else:
# shape of key_values = (batch_size, t, hidden_size)
key_values = torch.cat((state[2][self.i], X), dim=1)
state[2][self.i] = key_values
if self.training:
batch_size, seq_len, _ = X.shape
# Shape: (batch_size, seq_len), the values in the j-th column are j+1
valid_length = torch.FloatTensor(np.tile(np.arange(1, seq_len+1), (batch_size, 1)))
valid_length = valid_length.to(X.device)
else:
valid_length = None
X2 = self.attention_1(X, key_values, key_values, valid_length)
Y = self.addnorm_1(X, X2)
Y2 = self.attention_2(Y, enc_outputs, enc_outputs, enc_valid_length)
Z = self.addnorm_2(Y, Y2)
return self.addnorm_3(Z, self.ffn(Z)), state
decoder_blk = DecoderBlock(24, 48, 8, 0.5, 0)
X = torch.ones((2, 100, 24))
state = [encoder_blk(X, valid_length), valid_length, [None]]
decoder_blk(X, state)[0].shape
The Transformer decoder is constructed in the same way as the encoder, except that a dense layer is added after the last block to obtain the output confidence scores. Let us implement the Transformer decoder: besides the usual hyperparameters such as vocab_size and embedding_size, the decoder also needs the encoder outputs enc_outputs and the valid sentence lengths enc_valid_length.
class TransformerDecoder(d2l.Decoder):
def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
num_heads, num_layers, dropout, **kwargs):
super(TransformerDecoder, self).__init__(**kwargs)
self.embedding_size = embedding_size
self.num_layers = num_layers
self.embed = nn.Embedding(vocab_size, embedding_size)
self.pos_encoding = PositionalEncoding(embedding_size, dropout)
self.blks = nn.ModuleList()
for i in range(num_layers):
self.blks.append(
DecoderBlock(embedding_size, ffn_hidden_size, num_heads,
dropout, i))
self.dense = nn.Linear(embedding_size, vocab_size)
def init_state(self, enc_outputs, enc_valid_length, *args):
return [enc_outputs, enc_valid_length, [None]*self.num_layers]
def forward(self, X, state):
X = self.pos_encoding(self.embed(X) * math.sqrt(self.embedding_size))
for blk in self.blks:
X, state = blk(X, state)
return self.dense(X), state
Training
import zipfile
import torch
import requests
from io import BytesIO
from torch.utils import data
import sys
import collections
class Vocab(object): # This class is saved in d2l.
def __init__(self, tokens, min_freq=0, use_special_tokens=False):
# sort by frequency and token
counter = collections.Counter(tokens)
token_freqs = sorted(counter.items(), key=lambda x: x[0])
token_freqs.sort(key=lambda x: x[1], reverse=True)
if use_special_tokens:
# padding, begin of sentence, end of sentence, unknown
self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            tokens = ['<pad>', '<bos>', '<eos>', '<unk>']
else:
self.unk = 0
            tokens = ['<unk>']
tokens += [token for token, freq in token_freqs if freq >= min_freq]
self.idx_to_token = []
self.token_to_idx = dict()
for token in tokens:
self.idx_to_token.append(token)
self.token_to_idx[token] = len(self.idx_to_token) - 1
def __len__(self):
return len(self.idx_to_token)
def __getitem__(self, tokens):
if not isinstance(tokens, (list, tuple)):
return self.token_to_idx.get(tokens, self.unk)
else:
return [self.__getitem__(token) for token in tokens]
def to_tokens(self, indices):
if not isinstance(indices, (list, tuple)):
return self.idx_to_token[indices]
else:
return [self.idx_to_token[index] for index in indices]
def load_data_nmt(batch_size, max_len, num_examples=1000):
"""Download an NMT dataset, return its vocabulary and data iterator."""
# Download and preprocess
def preprocess_raw(text):
text = text.replace('\u202f', ' ').replace('\xa0', ' ')
out = ''
for i, char in enumerate(text.lower()):
if char in (',', '!', '.') and text[i-1] != ' ':
out += ' '
out += char
return out
with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
raw_text = f.read()
text = preprocess_raw(raw_text)
# Tokenize
source, target = [], []
for i, line in enumerate(text.split('\n')):
if i >= num_examples:
break
parts = line.split('\t')
if len(parts) >= 2:
source.append(parts[0].split(' '))
target.append(parts[1].split(' '))
# Build vocab
def build_vocab(tokens):
tokens = [token for line in tokens for token in line]
return Vocab(tokens, min_freq=3, use_special_tokens=True)
src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
# Convert to index arrays
def pad(line, max_len, padding_token):
if len(line) > max_len:
return line[:max_len]
return line + [padding_token] * (max_len - len(line))
def build_array(lines, vocab, max_len, is_source):
lines = [vocab[line] for line in lines]
if not is_source:
lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
valid_len = (array != vocab.pad).sum(1)
return array, valid_len
src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
return src_vocab, tgt_vocab, train_iter
import os
import d2l
# The platform does not support GPU for now, so training automatically falls back to the CPU; once GPU is available it will be used.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
embed_size, embedding_size, num_layers, dropout = 32, 32, 2, 0.05
batch_size, num_steps = 64, 10
lr, num_epochs, ctx = 0.005, 250, d2l.try_gpu()
print(ctx)
num_hiddens, num_heads = 64, 4
src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size, num_steps)
encoder = TransformerEncoder(
len(src_vocab), embedding_size, num_hiddens, num_heads, num_layers,
dropout)
decoder = TransformerDecoder(
    len(tgt_vocab), embedding_size, num_hiddens, num_heads, num_layers,
dropout)
model = d2l.EncoderDecoder(encoder, decoder)
d2l.train_s2s_ch9(model, train_iter, lr, num_epochs, ctx)
model.eval()
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
print(sentence + ' => ' + d2l.predict_s2s_ch9(
model, sentence, src_vocab, tgt_vocab, num_steps, ctx))
print("END")