.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    class Seq2SeqAttentionDecoder(AttentionDecoder):
        """GRU decoder that recomputes an additive-attention context at every
        decoding step, using the top-layer hidden state as the query."""

        def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                     dropout=0):
            super().__init__()
            self.attention = d2l.AdditiveAttention(num_hiddens, dropout)
            self.embedding = nn.Embedding(vocab_size, embed_size)
            # Each GRU input is a token embedding concatenated with a context
            # vector, hence the widened input size
            rnn_input_size = embed_size + num_hiddens
            self.rnn = nn.GRU(rnn_input_size, num_hiddens, num_layers,
                              dropout=dropout)
            self.dense = nn.LazyLinear(vocab_size)
            self.apply(d2l.init_seq2seq)

        def init_state(self, enc_outputs, enc_valid_lens):
            """Build the decoder state from the encoder's return value."""
            # enc_seq: (num_steps, batch_size, num_hiddens)
            # enc_hidden: (num_layers, batch_size, num_hiddens)
            enc_seq, enc_hidden = enc_outputs
            # Move the batch axis first so attention can use it as keys/values
            return (enc_seq.permute(1, 0, 2), enc_hidden, enc_valid_lens)

        def forward(self, X, state):
            """Decode X one time step at a time and return (logits, state)."""
            # enc_outputs: (batch_size, num_steps, num_hiddens)
            # hidden_state: (num_layers, batch_size, num_hiddens)
            enc_outputs, hidden_state, enc_valid_lens = state
            # embedded: (num_steps, batch_size, embed_size)
            embedded = self.embedding(X).permute(1, 0, 2)
            step_outputs = []
            self._attention_weights = []
            for step_emb in embedded:
                # query: (batch_size, 1, num_hiddens)
                query = hidden_state[-1].unsqueeze(1)
                # context: (batch_size, 1, num_hiddens)
                context = self.attention(
                    query, enc_outputs, enc_outputs, enc_valid_lens)
                # Join context and embedding along the feature dimension
                rnn_input = torch.cat((context, step_emb.unsqueeze(1)), dim=-1)
                # The GRU expects (1, batch_size, embed_size + num_hiddens)
                out, hidden_state = self.rnn(
                    rnn_input.permute(1, 0, 2), hidden_state)
                step_outputs.append(out)
                self._attention_weights.append(
                    self.attention.attention_weights)
            # After the dense layer: (num_steps, batch_size, vocab_size)
            logits = self.dense(torch.cat(step_outputs, dim=0))
            return logits.permute(1, 0, 2), [enc_outputs, hidden_state,
                                             enc_valid_lens]

        @property
        def attention_weights(self):
            # One entry per decoding step of the most recent forward pass
            return self._attention_weights

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    class Seq2SeqAttentionDecoder(AttentionDecoder):
        """GRU decoder that recomputes an additive-attention context at every
        decoding step, using the top-layer hidden state as the query."""

        def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                     dropout=0):
            super().__init__()
            self.attention = d2l.AdditiveAttention(num_hiddens, dropout)
            self.embedding = nn.Embedding(vocab_size, embed_size)
            self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=dropout)
            self.dense = nn.Dense(vocab_size, flatten=False)
            self.initialize(init.Xavier())

        def init_state(self, enc_outputs, enc_valid_lens):
            """Build the decoder state from the encoder's return value."""
            # enc_seq: (num_steps, batch_size, num_hiddens)
            # enc_hidden: (num_layers, batch_size, num_hiddens)
            enc_seq, enc_hidden = enc_outputs
            # Move the batch axis first so attention can use it as keys/values
            return (enc_seq.swapaxes(0, 1), enc_hidden, enc_valid_lens)

        def forward(self, X, state):
            """Decode X one time step at a time and return (logits, state)."""
            # enc_outputs: (batch_size, num_steps, num_hiddens)
            # hidden_state: (num_layers, batch_size, num_hiddens)
            enc_outputs, hidden_state, enc_valid_lens = state
            # embedded: (num_steps, batch_size, embed_size)
            embedded = self.embedding(X).swapaxes(0, 1)
            step_outputs = []
            self._attention_weights = []
            for step_emb in embedded:
                # query: (batch_size, 1, num_hiddens)
                query = np.expand_dims(hidden_state[-1], axis=1)
                # context: (batch_size, 1, num_hiddens)
                context = self.attention(
                    query, enc_outputs, enc_outputs, enc_valid_lens)
                # Join context and embedding along the feature dimension
                rnn_input = np.concatenate(
                    (context, np.expand_dims(step_emb, axis=1)), axis=-1)
                # The GRU expects (1, batch_size, embed_size + num_hiddens)
                out, new_state = self.rnn(rnn_input.swapaxes(0, 1),
                                          hidden_state)
                # Keep only the first element of the returned state
                hidden_state = new_state[0]
                step_outputs.append(out)
                self._attention_weights.append(
                    self.attention.attention_weights)
            # After the dense layer: (num_steps, batch_size, vocab_size)
            logits = self.dense(np.concatenate(step_outputs, axis=0))
            return logits.swapaxes(0, 1), [enc_outputs, hidden_state,
                                           enc_valid_lens]

        @property
        def attention_weights(self):
            # One entry per decoding step of the most recent forward pass
            return self._attention_weights

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    class Seq2SeqAttentionDecoder(nn.Module):
        """GRU decoder that recomputes an additive-attention context at every
        decoding step, using the top-layer hidden state as the query."""
        vocab_size: int
        embed_size: int
        num_hiddens: int
        num_layers: int
        dropout: float = 0

        def setup(self):
            self.attention = d2l.AdditiveAttention(self.num_hiddens,
                                                   self.dropout)
            self.embedding = nn.Embed(self.vocab_size, self.embed_size)
            self.dense = nn.Dense(self.vocab_size)
            # Read the sizes from this module's own fields; bare
            # `num_hiddens` / `num_layers` would resolve to whatever globals
            # happen to exist when `setup` runs
            self.rnn = d2l.GRU(self.num_hiddens, self.num_layers,
                               dropout=self.dropout)

        def init_state(self, enc_outputs, enc_valid_lens, *args):
            """Build the decoder state from the encoder's return value."""
            # Shape of outputs: (num_steps, batch_size, num_hiddens).
            # Shape of hidden_state: (num_layers, batch_size, num_hiddens)
            outputs, hidden_state = enc_outputs
            return (outputs.transpose(1, 0, 2), hidden_state, enc_valid_lens)

        @nn.compact
        def __call__(self, X, state, training=False):
            """Decode X one time step at a time and return (logits, state)."""
            # Shape of enc_outputs: (batch_size, num_steps, num_hiddens).
            # Shape of hidden_state: (num_layers, batch_size, num_hiddens)
            enc_outputs, hidden_state, enc_valid_lens = state
            # Shape of the output X: (num_steps, batch_size, embed_size)
            X = self.embedding(X).transpose(1, 0, 2)
            outputs, attention_weights = [], []
            for x in X:
                # Shape of query: (batch_size, 1, num_hiddens)
                query = jnp.expand_dims(hidden_state[-1], axis=1)
                # Shape of context: (batch_size, 1, num_hiddens)
                context, attention_w = self.attention(
                    query, enc_outputs, enc_outputs, enc_valid_lens,
                    training=training)
                # Concatenate on the feature dimension
                x = jnp.concatenate((context, jnp.expand_dims(x, axis=1)),
                                    axis=-1)
                # Reshape x as (1, batch_size, embed_size + num_hiddens)
                out, hidden_state = self.rnn(x.transpose(1, 0, 2),
                                             hidden_state, training=training)
                outputs.append(out)
                attention_weights.append(attention_w)

            # Flax sow API is used to capture intermediate variables
            self.sow('intermediates', 'dec_attention_weights',
                     attention_weights)

            # After fully connected layer transformation, shape of outputs:
            # (num_steps, batch_size, vocab_size)
            outputs = self.dense(jnp.concatenate(outputs, axis=0))
            return outputs.transpose(1, 0, 2), [enc_outputs, hidden_state,
                                                enc_valid_lens]

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    class Seq2SeqAttentionDecoder(AttentionDecoder):
        """Stacked-GRU decoder that recomputes an additive-attention context
        at every decoding step, using the last layer's state as the query."""

        def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                     dropout=0):
            super().__init__()
            self.attention = d2l.AdditiveAttention(num_hiddens, num_hiddens,
                                                   num_hiddens, dropout)
            self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
            # One GRU cell per layer, stacked into a single recurrent layer
            gru_cells = [tf.keras.layers.GRUCell(num_hiddens, dropout=dropout)
                         for _ in range(num_layers)]
            self.rnn = tf.keras.layers.RNN(
                tf.keras.layers.StackedRNNCells(gru_cells),
                return_sequences=True, return_state=True)
            self.dense = tf.keras.layers.Dense(vocab_size)

        def init_state(self, enc_outputs, enc_valid_lens):
            """Build the decoder state from the encoder's return value."""
            # hidden_state is a list of length num_layers whose elements have
            # shape (batch_size, num_hiddens)
            enc_seq, enc_hidden = enc_outputs
            # Swap the first two axes so that the stored encoder outputs are
            # (batch_size, num_steps, num_hiddens), as `call` expects
            return (tf.transpose(enc_seq, (1, 0, 2)), enc_hidden,
                    enc_valid_lens)

        def call(self, X, state, **kwargs):
            """Decode X one time step at a time and return (logits, state)."""
            # enc_outputs: (batch_size, num_steps, num_hiddens)
            # hidden_state: list of length num_layers whose elements have
            # shape (batch_size, num_hiddens)
            enc_outputs, hidden_state, enc_valid_lens = state
            # Input X has shape (batch_size, num_steps); after embedding and
            # transposing it is (num_steps, batch_size, embed_size)
            embedded = tf.transpose(self.embedding(X), perm=(1, 0, 2))
            step_outputs = []
            self._attention_weights = []
            for step_emb in embedded:
                # query: (batch_size, 1, num_hiddens)
                query = tf.expand_dims(hidden_state[-1], axis=1)
                # context: (batch_size, 1, num_hiddens)
                context = self.attention(query, enc_outputs, enc_outputs,
                                         enc_valid_lens, **kwargs)
                # Join context and embedding along the feature dimension
                rnn_input = tf.concat(
                    (context, tf.expand_dims(step_emb, axis=1)), axis=-1)
                rnn_result = self.rnn(rnn_input, hidden_state, **kwargs)
                # First element is the output sequence, the rest are the
                # per-layer states
                hidden_state = rnn_result[1:]
                step_outputs.append(rnn_result[0])
                self._attention_weights.append(
                    self.attention.attention_weights)
            # After the dense layer: (batch_size, num_steps, vocab_size)
            logits = self.dense(tf.concat(step_outputs, axis=1))
            return logits, [enc_outputs, hidden_state, enc_valid_lens]

        @property
        def attention_weights(self):
            # One entry per decoding step of the most recent call
            return self._attention_weights

.. raw:: html

.. raw:: html

pytorch mxnet jax tensorflow

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Small hyperparameters for a quick shape sanity check
    vocab_size = 10
    embed_size = 8
    num_hiddens = 16
    num_layers = 2
    batch_size = 4
    num_steps = 7

    encoder = d2l.Seq2SeqEncoder(
        vocab_size, embed_size, num_hiddens, num_layers)
    decoder = Seq2SeqAttentionDecoder(
        vocab_size, embed_size, num_hiddens, num_layers)

    # A batch of all-zero token indices; no valid lengths are supplied
    X = torch.zeros((batch_size, num_steps), dtype=torch.long)
    state = decoder.init_state(encoder(X), None)
    output, state = decoder(X, state)

    d2l.check_shape(output, (batch_size, num_steps, vocab_size))
    d2l.check_shape(state[0], (batch_size, num_steps, num_hiddens))
    d2l.check_shape(state[1][0], (batch_size, num_hiddens))

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Small hyperparameters for a quick shape sanity check
    vocab_size = 10
    embed_size = 8
    num_hiddens = 16
    num_layers = 2
    batch_size = 4
    num_steps = 7

    encoder = d2l.Seq2SeqEncoder(
        vocab_size, embed_size, num_hiddens, num_layers)
    decoder = Seq2SeqAttentionDecoder(
        vocab_size, embed_size, num_hiddens, num_layers)

    # A batch of all-zero token indices; no valid lengths are supplied
    X = np.zeros((batch_size, num_steps))
    state = decoder.init_state(encoder(X), None)
    output, state = decoder(X, state)

    d2l.check_shape(output, (batch_size, num_steps, vocab_size))
    d2l.check_shape(state[0], (batch_size, num_steps, num_hiddens))
    d2l.check_shape(state[1][0], (batch_size, num_hiddens))

.. raw:: latex

    \diilbookstyleoutputcell

.. parsed-literal::
    :class: output

    [22:45:30] ../src/storage/storage.cc:196: Using Pooled (Naive) StorageManager for CPU

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Small hyperparameters for a quick shape sanity check
    vocab_size = 10
    embed_size = 8
    num_hiddens = 16
    num_layers = 2
    batch_size = 4
    num_steps = 7

    encoder = d2l.Seq2SeqEncoder(
        vocab_size, embed_size, num_hiddens, num_layers)
    decoder = Seq2SeqAttentionDecoder(
        vocab_size, embed_size, num_hiddens, num_layers)

    # A batch of all-zero token indices; no valid lengths are supplied
    X = jnp.zeros((batch_size, num_steps), dtype=jnp.int32)
    # `init_with_output` returns (outputs, variables); only the outputs are
    # needed here
    state = decoder.init_state(
        encoder.init_with_output(d2l.get_key(), X, training=False)[0], None)
    (output, state), _ = decoder.init_with_output(
        d2l.get_key(), X, state, training=False)

    d2l.check_shape(output, (batch_size, num_steps, vocab_size))
    d2l.check_shape(state[0], (batch_size, num_steps, num_hiddens))
    d2l.check_shape(state[1][0], (batch_size, num_hiddens))

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Small hyperparameters for a quick shape sanity check
    vocab_size = 10
    embed_size = 8
    num_hiddens = 16
    num_layers = 2
    batch_size = 4
    num_steps = 7

    encoder = d2l.Seq2SeqEncoder(
        vocab_size, embed_size, num_hiddens, num_layers)
    decoder = Seq2SeqAttentionDecoder(
        vocab_size, embed_size, num_hiddens, num_layers)

    # A batch of all-zero token indices; no valid lengths are supplied
    X = tf.zeros((batch_size, num_steps))
    state = decoder.init_state(encoder(X, training=False), None)
    output, state = decoder(X, state, training=False)

    d2l.check_shape(output, (batch_size, num_steps, vocab_size))
    d2l.check_shape(state[0], (batch_size, num_steps, num_hiddens))
    d2l.check_shape(state[1][0], (batch_size, num_hiddens))

.. raw:: html

.. raw:: html

pytorch mxnet jax tensorflow

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Train the attention-based encoder-decoder on the English-French data
    data = d2l.MTFraEng(batch_size=128)
    embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2
    encoder = d2l.Seq2SeqEncoder(
        len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)
    decoder = Seq2SeqAttentionDecoder(
        len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
    # `tgt_pad` is the target-vocabulary index of the padding token
    model = d2l.Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab['<pad>'],
                        lr=0.005)
    trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=1)
    trainer.fit(model, data)

.. figure:: output_bahdanau-attention_dd4710_60_0.svg

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Train the attention-based encoder-decoder on the English-French data
    data = d2l.MTFraEng(batch_size=128)
    embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2
    encoder = d2l.Seq2SeqEncoder(
        len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)
    decoder = Seq2SeqAttentionDecoder(
        len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
    # `tgt_pad` is the target-vocabulary index of the padding token
    model = d2l.Seq2Seq(encoder, decoder, tgt_pad=data.tgt_vocab['<pad>'],
                        lr=0.005, training=True)
    trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1, num_gpus=1)
    trainer.fit(model, data)

.. figure:: output_bahdanau-attention_dd4710_66_0.svg

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Train the attention-based encoder-decoder on the English-French data
    data = d2l.MTFraEng(batch_size=128)
    embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2
    # Create the model inside the device scope returned by `d2l.try_gpu()`
    with d2l.try_gpu():
        encoder = d2l.Seq2SeqEncoder(
            len(data.src_vocab), embed_size, num_hiddens, num_layers, dropout)
        decoder = Seq2SeqAttentionDecoder(
            len(data.tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
        # `tgt_pad` is the target-vocabulary index of the padding token
        model = d2l.Seq2Seq(encoder, decoder,
                            tgt_pad=data.tgt_vocab['<pad>'], lr=0.005)
    trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1)
    trainer.fit(model, data)

.. figure:: output_bahdanau-attention_dd4710_69_0.svg

.. raw:: html

.. raw:: html

pytorch mxnet jax tensorflow

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Translate a few English sentences and score them against references
    engs = ['go .', 'i lost .', 'he\'s calm .', 'i\'m home .']
    fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
    preds, _ = model.predict_step(
        data.build(engs, fras), d2l.try_gpu(), data.num_steps)
    for en, fr, p in zip(engs, fras, preds):
        translation = []
        for token in data.tgt_vocab.to_tokens(p):
            # Stop at the end-of-sequence token; anything after it is not
            # part of the translation
            if token == '<eos>':
                break
            translation.append(token)
        print(f'{en} => {translation}, bleu,'
              f'{d2l.bleu(" ".join(translation), fr, k=2):.3f}')

.. raw:: latex

    \diilbookstyleoutputcell

.. parsed-literal::
    :class: output

    go . => ['va', '!'], bleu,1.000
    i lost . => ["j'ai", 'perdu', '.'], bleu,1.000
    he's calm . => ['il', 'court', '.'], bleu,0.000
    i'm home . => ['je', 'suis', 'chez', 'moi', '.'], bleu,1.000

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Translate a few English sentences and score them against references
    engs = ['go .', 'i lost .', 'he\'s calm .', 'i\'m home .']
    fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
    preds, _ = model.predict_step(
        data.build(engs, fras), d2l.try_gpu(), data.num_steps)
    for en, fr, p in zip(engs, fras, preds):
        translation = []
        for token in data.tgt_vocab.to_tokens(p):
            # Stop at the end-of-sequence token; anything after it is not
            # part of the translation
            if token == '<eos>':
                break
            translation.append(token)
        print(f'{en} => {translation}, bleu,'
              f'{d2l.bleu(" ".join(translation), fr, k=2):.3f}')

.. raw:: latex

    \diilbookstyleoutputcell

.. parsed-literal::
    :class: output

    go . => ['<unk>', '!'], bleu,0.000
    i lost . => ["j'ai", 'perdu', '.'], bleu,1.000
    he's calm . => ['il', 'court', '.'], bleu,0.000
    i'm home . => ['je', 'suis', 'certain', '.'], bleu,0.512

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Translate a few English sentences and score them against references
    engs = ['go .', 'i lost .', 'he\'s calm .', 'i\'m home .']
    fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
    preds, _ = model.predict_step(
        trainer.state.params, data.build(engs, fras), data.num_steps)
    for en, fr, p in zip(engs, fras, preds):
        translation = []
        for token in data.tgt_vocab.to_tokens(p):
            # Stop at the end-of-sequence token; anything after it is not
            # part of the translation
            if token == '<eos>':
                break
            translation.append(token)
        print(f'{en} => {translation}, bleu,'
              f'{d2l.bleu(" ".join(translation), fr, k=2):.3f}')

.. raw:: latex

    \diilbookstyleoutputcell

.. parsed-literal::
    :class: output

    go . => ['<unk>', '!'], bleu,0.000
    i lost . => ["j'ai", 'perdu', '.'], bleu,1.000
    he's calm . => ['il', 'court', '.'], bleu,0.000
    i'm home . => ['je', 'suis', 'chez', 'moi', '.'], bleu,1.000

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Translate a few English sentences and score them against references
    engs = ['go .', 'i lost .', 'he\'s calm .', 'i\'m home .']
    fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
    preds, _ = model.predict_step(
        data.build(engs, fras), d2l.try_gpu(), data.num_steps)
    for en, fr, p in zip(engs, fras, preds):
        translation = []
        for token in data.tgt_vocab.to_tokens(p):
            # Stop at the end-of-sequence token; anything after it is not
            # part of the translation
            if token == '<eos>':
                break
            translation.append(token)
        print(f'{en} => {translation}, bleu,'
              f'{d2l.bleu(" ".join(translation), fr, k=2):.3f}')

.. raw:: latex

    \diilbookstyleoutputcell

.. parsed-literal::
    :class: output

    go . => ['<unk>', 'à', 'rouler', '!'], bleu,0.000
    i lost . => ['je', 'suis', 'bien', '.'], bleu,0.000
    he's calm . => ['il', 'a', 'gagné', '.'], bleu,0.000
    i'm home . => ['je', 'suis', '<unk>', '.'], bleu,0.512

.. raw:: html

.. raw:: html

pytorch mxnet jax tensorflow

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Re-translate the last sentence, this time requesting the per-step
    # attention weights as the second return value
    _, dec_attention_weights = model.predict_step(
        data.build([engs[-1]], [fras[-1]]), d2l.try_gpu(), data.num_steps,
        True)
    # Stack the weights of every decoding step and give them the
    # four-dimensional layout passed to `show_heatmaps`
    attention_weights = torch.cat(
        [step[0][0][0] for step in dec_attention_weights], dim=0,
    ).reshape((1, 1, -1, data.num_steps))

    # Plus one to include the end-of-sequence token
    d2l.show_heatmaps(
        attention_weights[:, :, :, :len(engs[-1].split()) + 1].cpu(),
        xlabel='Key positions', ylabel='Query positions')

.. figure:: output_bahdanau-attention_dd4710_90_0.svg

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Re-translate the last sentence, this time requesting the per-step
    # attention weights as the second return value
    _, dec_attention_weights = model.predict_step(
        data.build([engs[-1]], [fras[-1]]), d2l.try_gpu(), data.num_steps,
        True)
    # Stack the weights of every decoding step and give them the
    # four-dimensional layout passed to `show_heatmaps`
    attention_weights = np.concatenate(
        [step[0][0][0] for step in dec_attention_weights], axis=0,
    ).reshape((1, 1, -1, data.num_steps))

    # Plus one to include the end-of-sequence token
    d2l.show_heatmaps(
        attention_weights[:, :, :, :len(engs[-1].split()) + 1],
        xlabel='Key positions', ylabel='Query positions')

.. figure:: output_bahdanau-attention_dd4710_93_0.svg

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Re-translate the last sentence, this time requesting the per-step
    # attention weights; they come back as the first element of a pair
    _, (dec_attention_weights, _) = model.predict_step(
        trainer.state.params, data.build([engs[-1]], [fras[-1]]),
        data.num_steps, True)
    # Stack the weights of every decoding step and give them the
    # four-dimensional layout passed to `show_heatmaps`
    attention_weights = jnp.concatenate(
        [step[0][0][0] for step in dec_attention_weights], axis=0,
    ).reshape((1, 1, -1, data.num_steps))

    # Plus one to include the end-of-sequence token
    d2l.show_heatmaps(
        attention_weights[:, :, :, :len(engs[-1].split()) + 1],
        xlabel='Key positions', ylabel='Query positions')

.. figure:: output_bahdanau-attention_dd4710_96_0.svg

.. raw:: html

.. raw:: html

.. raw:: latex

    \diilbookstyleinputcell

.. code:: python

    # Re-translate the last sentence, this time requesting the per-step
    # attention weights as the second return value
    _, dec_attention_weights = model.predict_step(
        data.build([engs[-1]], [fras[-1]]), d2l.try_gpu(), data.num_steps,
        True)
    # Stack the weights of every decoding step and give them the
    # four-dimensional layout passed to `show_heatmaps`
    attention_weights = tf.reshape(
        tf.concat([step[0][0][0] for step in dec_attention_weights], axis=0),
        (1, 1, -1, data.num_steps))

    # Plus one to include the end-of-sequence token
    d2l.show_heatmaps(
        attention_weights[:, :, :, :len(engs[-1].split()) + 1],
        xlabel='Key positions', ylabel='Query positions')

.. figure:: output_bahdanau-attention_dd4710_99_0.svg

.. raw:: html

.. raw:: html