.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTMScratch(d2l.Module):
       """LSTM layer written from scratch.

       Holds one (input weight, hidden weight, bias) triple for each of the
       input gate, forget gate, output gate, and input node.
       """
       def __init__(self, num_inputs, num_hiddens, sigma=0.01):
           super().__init__()
           # Must run before any helper local is bound; forward() later reads
           # self.num_hiddens, which is not assigned anywhere else here.
           self.save_hyperparameters()

           def gaussian(*shape):
               # Weight entries: standard normal samples scaled by sigma.
               return nn.Parameter(torch.randn(*shape) * sigma)

           def gate_params():
               # Created in the order W_x, W_h, b so random draws and
               # parameter registration always follow the same sequence.
               w_x = gaussian(num_inputs, num_hiddens)
               w_h = gaussian(num_hiddens, num_hiddens)
               bias = nn.Parameter(torch.zeros(num_hiddens))
               return w_x, w_h, bias

           self.W_xi, self.W_hi, self.b_i = gate_params()  # Input gate
           self.W_xf, self.W_hf, self.b_f = gate_params()  # Forget gate
           self.W_xo, self.W_ho, self.b_o = gate_params()  # Output gate
           self.W_xc, self.W_hc, self.b_c = gate_params()  # Input node

The actual model is defined as described above, consisting of three
gates and an input node. Note that only the hidden state is passed to
the output layer.

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   @d2l.add_to_class(LSTMScratch)
   def forward(self, inputs, H_C=None):
       """Run the LSTM over ``inputs``, one step per entry of its first axis.

       ``H_C`` is an optional ``(H, C)`` pair; when it is None both states
       start as zeros of shape (batch_size, num_hiddens), with batch_size
       taken from ``inputs.shape[1]``. Returns the list of per-step hidden
       states together with the final ``(H, C)``.
       """
       if H_C is not None:
           H, C = H_C
       else:
           # Initial state with shape: (batch_size, num_hiddens)
           state_shape = (inputs.shape[1], self.num_hiddens)
           H = torch.zeros(state_shape, device=inputs.device)
           C = torch.zeros(state_shape, device=inputs.device)

       def pre_activation(X, H, W_x, W_h, b):
           # Shared affine form used by all three gates and the input node.
           return torch.matmul(X, W_x) + torch.matmul(H, W_h) + b

       outputs = []
       for X in inputs:
           I = torch.sigmoid(
               pre_activation(X, H, self.W_xi, self.W_hi, self.b_i))
           F = torch.sigmoid(
               pre_activation(X, H, self.W_xf, self.W_hf, self.b_f))
           O = torch.sigmoid(
               pre_activation(X, H, self.W_xo, self.W_ho, self.b_o))
           C_tilde = torch.tanh(
               pre_activation(X, H, self.W_xc, self.W_hc, self.b_c))
           # The memory cell mixes the old cell state and the new candidate.
           C = F * C + I * C_tilde
           # Only the hidden state is exposed as the per-step output.
           H = O * torch.tanh(C)
           outputs.append(H)
       return outputs, (H, C)

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTMScratch(d2l.Module):
       """LSTM layer written from scratch.

       Holds one (input weight, hidden weight, bias) triple for each of the
       input gate, forget gate, output gate, and input node.
       """
       def __init__(self, num_inputs, num_hiddens, sigma=0.01):
           super().__init__()
           # Must run before any helper local is bound; forward() later reads
           # self.num_hiddens, which is not assigned anywhere else here.
           self.save_hyperparameters()

           def gaussian(*shape):
               # Weight entries: standard normal samples scaled by sigma.
               return np.random.randn(*shape) * sigma

           def gate_params():
               # Created in the order W_x, W_h, b so the random draws always
               # follow the same sequence.
               w_x = gaussian(num_inputs, num_hiddens)
               w_h = gaussian(num_hiddens, num_hiddens)
               bias = np.zeros(num_hiddens)
               return w_x, w_h, bias

           self.W_xi, self.W_hi, self.b_i = gate_params()  # Input gate
           self.W_xf, self.W_hf, self.b_f = gate_params()  # Forget gate
           self.W_xo, self.W_ho, self.b_o = gate_params()  # Output gate
           self.W_xc, self.W_hc, self.b_c = gate_params()  # Input node

The actual model is defined as described above, consisting of three
gates and an input node. Note that only the hidden state is passed to
the output layer.

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   @d2l.add_to_class(LSTMScratch)
   def forward(self, inputs, H_C=None):
       """Run the LSTM over ``inputs``, one step per entry of its first axis.

       ``H_C`` is an optional ``(H, C)`` pair; when it is None both states
       start as zeros of shape (batch_size, num_hiddens), with batch_size
       taken from ``inputs.shape[1]``. Returns the list of per-step hidden
       states together with the final ``(H, C)``.
       """
       if H_C is not None:
           H, C = H_C
       else:
           # Initial state with shape: (batch_size, num_hiddens)
           state_shape = (inputs.shape[1], self.num_hiddens)
           H = np.zeros(state_shape, ctx=inputs.ctx)
           C = np.zeros(state_shape, ctx=inputs.ctx)

       def pre_activation(X, H, W_x, W_h, b):
           # Shared affine form used by all three gates and the input node.
           return np.dot(X, W_x) + np.dot(H, W_h) + b

       outputs = []
       for X in inputs:
           I = npx.sigmoid(
               pre_activation(X, H, self.W_xi, self.W_hi, self.b_i))
           F = npx.sigmoid(
               pre_activation(X, H, self.W_xf, self.W_hf, self.b_f))
           O = npx.sigmoid(
               pre_activation(X, H, self.W_xo, self.W_ho, self.b_o))
           C_tilde = np.tanh(
               pre_activation(X, H, self.W_xc, self.W_hc, self.b_c))
           # The memory cell mixes the old cell state and the new candidate.
           C = F * C + I * C_tilde
           # Only the hidden state is exposed as the per-step output.
           H = O * np.tanh(C)
           outputs.append(H)
       return outputs, (H, C)

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTMScratch(d2l.Module):
       """LSTM layer written from scratch.

       Declares one (input weight, hidden weight, bias) triple for each of
       the input gate, forget gate, output gate, and input node. Weights use
       a normal initializer with scale ``sigma``; biases start at zero.
       """
       num_inputs: int
       num_hiddens: int
       sigma: float = 0.01

       def setup(self):
           def init_weight(name, shape):
               return self.param(name, nn.initializers.normal(self.sigma),
                                 shape)

           def triple(name):
               # Parameter names are derived from the gate suffix, e.g.
               # 'W_xi', 'W_hi', 'b_i' for name == 'i'.
               return (
                   init_weight(f'W_x{name}',
                               (self.num_inputs, self.num_hiddens)),
                   init_weight(f'W_h{name}',
                               (self.num_hiddens, self.num_hiddens)),
                   # The shape is a one-element tuple (note the trailing
                   # comma); a bare parenthesised int is not a tuple.
                   self.param(f'b_{name}', nn.initializers.zeros,
                              (self.num_hiddens,)))

           self.W_xi, self.W_hi, self.b_i = triple('i')  # Input gate
           self.W_xf, self.W_hf, self.b_f = triple('f')  # Forget gate
           self.W_xo, self.W_ho, self.b_o = triple('o')  # Output gate
           self.W_xc, self.W_hc, self.b_c = triple('c')  # Input node

The actual model is defined as described above, consisting of three
gates and an input node. Note that only the hidden state is passed to
the output layer. A long for-loop in the ``forward`` method will result
in an extremely long JIT compilation time for the first run. As a
solution to this, instead of using a for-loop to update the state with
every time step, JAX provides the ``jax.lax.scan`` utility
transformation to achieve the same behavior. It takes in an initial
state called ``carry`` and an ``inputs`` array which is scanned on its
leading axis. The ``scan`` transformation ultimately returns the final
state and the stacked outputs as expected.

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   @d2l.add_to_class(LSTMScratch)
   def forward(self, inputs, H_C=None):
       """Run the LSTM over ``inputs`` by scanning along its leading axis.

       ``H_C`` is an optional ``(H, C)`` carry; when it is None both states
       start as zeros of shape (batch_size, num_hiddens), with batch_size
       taken from ``inputs.shape[1]``. Returns the stacked per-step hidden
       states together with the final carry.
       """
       # Use lax.scan primitive instead of looping over the
       # inputs, since scan saves time in jit compilation.
       def scan_fn(carry, X):
           H, C = carry
           I = jax.nn.sigmoid(jnp.matmul(X, self.W_xi) +
                              jnp.matmul(H, self.W_hi) + self.b_i)
           F = jax.nn.sigmoid(jnp.matmul(X, self.W_xf) +
                              jnp.matmul(H, self.W_hf) + self.b_f)
           O = jax.nn.sigmoid(jnp.matmul(X, self.W_xo) +
                              jnp.matmul(H, self.W_ho) + self.b_o)
           C_tilde = jnp.tanh(jnp.matmul(X, self.W_xc) +
                              jnp.matmul(H, self.W_hc) + self.b_c)
           C = F * C + I * C_tilde
           H = O * jnp.tanh(C)
           return (H, C), H  # return carry, y

       if H_C is None:
           batch_size = inputs.shape[1]
           carry = (jnp.zeros((batch_size, self.num_hiddens)),
                    jnp.zeros((batch_size, self.num_hiddens)))
       else:
           carry = H_C

       # scan takes the scan_fn, initial carry state, xs with leading axis
       # to be scanned
       carry, outputs = jax.lax.scan(scan_fn, carry, inputs)
       return outputs, carry

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTMScratch(d2l.Module):
       """LSTM layer written from scratch.

       Holds one (input weight, hidden weight, bias) triple for each of the
       input gate, forget gate, output gate, and input node.
       """
       def __init__(self, num_inputs, num_hiddens, sigma=0.01):
           super().__init__()
           # Must run before any helper local is bound; forward() later reads
           # self.num_hiddens, which is not assigned anywhere else here.
           self.save_hyperparameters()

           def gaussian(*shape):
               # Weight entries: standard normal samples scaled by sigma.
               return tf.Variable(tf.random.normal(shape) * sigma)

           def gate_params():
               # Created in the order W_x, W_h, b so the random draws always
               # follow the same sequence.
               w_x = gaussian(num_inputs, num_hiddens)
               w_h = gaussian(num_hiddens, num_hiddens)
               bias = tf.Variable(tf.zeros(num_hiddens))
               return w_x, w_h, bias

           self.W_xi, self.W_hi, self.b_i = gate_params()  # Input gate
           self.W_xf, self.W_hf, self.b_f = gate_params()  # Forget gate
           self.W_xo, self.W_ho, self.b_o = gate_params()  # Output gate
           self.W_xc, self.W_hc, self.b_c = gate_params()  # Input node

The actual model is defined as described above, consisting of three
gates and an input node. Note that only the hidden state is passed to
the output layer.

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   @d2l.add_to_class(LSTMScratch)
   def forward(self, inputs, H_C=None):
       """Run the LSTM over ``inputs``, one step per entry of its first axis.

       ``H_C`` is an optional ``(H, C)`` pair; when it is None both states
       start as zeros of shape (batch_size, num_hiddens), with batch_size
       taken from ``inputs.shape[1]``. Returns the list of per-step hidden
       states together with the final ``(H, C)``.
       """
       if H_C is not None:
           H, C = H_C
       else:
           # Initial state with shape: (batch_size, num_hiddens)
           state_shape = (inputs.shape[1], self.num_hiddens)
           H = tf.zeros(state_shape)
           C = tf.zeros(state_shape)

       def pre_activation(X, H, W_x, W_h, b):
           # Shared affine form used by all three gates and the input node.
           return tf.matmul(X, W_x) + tf.matmul(H, W_h) + b

       outputs = []
       for X in inputs:
           I = tf.sigmoid(
               pre_activation(X, H, self.W_xi, self.W_hi, self.b_i))
           F = tf.sigmoid(
               pre_activation(X, H, self.W_xf, self.W_hf, self.b_f))
           O = tf.sigmoid(
               pre_activation(X, H, self.W_xo, self.W_ho, self.b_o))
           C_tilde = tf.tanh(
               pre_activation(X, H, self.W_xc, self.W_hc, self.b_c))
           # The memory cell mixes the old cell state and the new candidate.
           C = F * C + I * C_tilde
           # Only the hidden state is exposed as the per-step output.
           H = O * tf.tanh(C)
           outputs.append(H)
       return outputs, (H, C)

.. raw:: html

.. raw:: html

pytorch mxnet jax tensorflow

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Train the from-scratch LSTM as a language model on the TimeMachine data.
   data = d2l.TimeMachine(batch_size=1024, num_steps=32)
   # The LSTM's input size is the vocabulary size.
   lstm = LSTMScratch(num_inputs=len(data.vocab), num_hiddens=32)
   model = d2l.RNNLMScratch(lstm, vocab_size=len(data.vocab), lr=4)
   # `data` and `trainer` are reused by later cells in this section.
   trainer = d2l.Trainer(max_epochs=50, gradient_clip_val=1, num_gpus=1)
   trainer.fit(model, data)

.. figure:: output_lstm_86eb9f_41_0.svg

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   # Train the from-scratch LSTM as a language model on the TimeMachine data.
   data = d2l.TimeMachine(batch_size=1024, num_steps=32)
   # NOTE(review): the extent of this `with` body was reconstructed from
   # flattened text -- confirm that only model construction belongs inside.
   with d2l.try_gpu():
       # The LSTM's input size is the vocabulary size.
       lstm = LSTMScratch(num_inputs=len(data.vocab), num_hiddens=32)
       model = d2l.RNNLMScratch(lstm, vocab_size=len(data.vocab), lr=4)
   # `data` and `trainer` are reused by later cells in this section.
   trainer = d2l.Trainer(max_epochs=50, gradient_clip_val=1)
   trainer.fit(model, data)

.. figure:: output_lstm_86eb9f_50_0.svg

.. raw:: html

.. raw:: html

pytorch mxnet jax tensorflow

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTM(d2l.RNN):
       """LSTM model that delegates the recurrence to ``nn.LSTM``."""
       def __init__(self, num_inputs, num_hiddens):
           # Calls d2l.Module's initializer directly instead of
           # super().__init__() -- presumably so that d2l.RNN.__init__ does
           # not build its own recurrent layer; confirm against d2l.RNN.
           d2l.Module.__init__(self)
           self.save_hyperparameters()
           self.rnn = nn.LSTM(num_inputs, num_hiddens)

       def forward(self, inputs, H_C=None):
           """Return whatever ``self.rnn`` returns for ``(inputs, H_C)``."""
           return self.rnn(inputs, H_C)

   # Reuses `data` and `trainer` defined in an earlier cell.
   lstm = LSTM(num_inputs=len(data.vocab), num_hiddens=32)
   model = d2l.RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
   trainer.fit(model, data)

.. figure:: output_lstm_86eb9f_56_0.svg

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   model.predict('it has', 20, data.vocab, d2l.try_gpu())

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   'it has a the time travelly'

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTM(d2l.RNN):
       """LSTM model that delegates the recurrence to ``rnn.LSTM``."""
       def __init__(self, num_hiddens):
           # Calls d2l.Module's initializer directly instead of
           # super().__init__() -- presumably so that d2l.RNN.__init__ does
           # not build its own recurrent layer; confirm against d2l.RNN.
           d2l.Module.__init__(self)
           self.save_hyperparameters()
           self.rnn = rnn.LSTM(num_hiddens)

       def forward(self, inputs, H_C=None):
           """Run ``self.rnn`` on ``inputs``, creating a state if none given."""
           if H_C is None:
               # Ask the layer for a begin state sized by inputs.shape[1]
               # and placed on the same context as the inputs.
               H_C = self.rnn.begin_state(
                   inputs.shape[1], ctx=inputs.ctx)
           return self.rnn(inputs, H_C)

   # Reuses `data` and `trainer` defined in an earlier cell.
   lstm = LSTM(num_hiddens=32)
   model = d2l.RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
   trainer.fit(model, data)

.. figure:: output_lstm_86eb9f_60_0.svg

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   model.predict('it has', 20, data.vocab, d2l.try_gpu())

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   'it has all the time travel'

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTM(d2l.RNN):
       """LSTM model built by scanning ``nn.OptimizedLSTMCell`` over time."""
       num_hiddens: int

       @nn.compact
       def __call__(self, inputs, H_C=None, training=False):
           """Apply the scanned cell to ``inputs`` along axis 0.

           When ``H_C`` is None the carry comes from
           ``nn.OptimizedLSTMCell.initialize_carry`` with a fixed PRNG key
           and a batch size read from ``inputs.shape[1]``. Returns the
           stacked outputs together with the final carry. ``training`` is
           accepted but not used in this method.
           """
           if H_C is None:
               H_C = nn.OptimizedLSTMCell.initialize_carry(
                   jax.random.PRNGKey(0),
                   (inputs.shape[1],),
                   self.num_hiddens)

           # Lift the cell class with nn.scan: inputs and outputs are
           # scanned on axis 0 and the "params" collection is broadcast.
           scanned_cell = nn.scan(
               nn.OptimizedLSTMCell,
               variable_broadcast="params",
               in_axes=0,
               out_axes=0,
               split_rngs={"params": False})

           final_carry, outputs = scanned_cell()(H_C, inputs)
           return outputs, final_carry

   lstm = LSTM(num_hiddens=32)
   model = d2l.RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
   trainer.fit(model, data)

.. figure:: output_lstm_86eb9f_64_0.svg

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   model.predict('it has', 20, data.vocab, trainer.state.params)

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   'it has and the pered han a'

.. raw:: html

.. raw:: html

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   class LSTM(d2l.RNN):
       """LSTM model that delegates to ``tf.keras.layers.LSTM``."""
       def __init__(self, num_hiddens):
           # Calls d2l.Module's initializer directly instead of
           # super().__init__() -- presumably so that d2l.RNN.__init__ does
           # not build its own recurrent layer; confirm against d2l.RNN.
           d2l.Module.__init__(self)
           self.save_hyperparameters()
           # The layer is configured to return the full output sequence and
           # its final state, with time-major inputs.
           self.rnn = tf.keras.layers.LSTM(
               num_hiddens, return_sequences=True, return_state=True,
               time_major=True)

       def forward(self, inputs, H_C=None):
           """Return the layer's outputs and its remaining results as a list."""
           # The first returned element is the output sequence; everything
           # after it is collected into H_C.
           outputs, *H_C = self.rnn(inputs, H_C)
           return outputs, H_C

   # Reuses `data` and `trainer` defined in an earlier cell.
   lstm = LSTM(num_hiddens=32)
   # NOTE(review): the extent of this `with` body was reconstructed from
   # flattened text -- confirm that only model construction belongs inside.
   with d2l.try_gpu():
       model = d2l.RNNLM(lstm, vocab_size=len(data.vocab), lr=4)
   trainer.fit(model, data)

.. figure:: output_lstm_86eb9f_68_0.svg

.. raw:: latex

   \diilbookstyleinputcell

.. code:: python

   model.predict('it has', 20, data.vocab)

.. raw:: latex

   \diilbookstyleoutputcell

.. parsed-literal::
   :class: output

   'it has a dimension a dimen'

.. raw:: html

.. raw:: html