How to properly initialize a TensorFlow GRU layer with noisy states?
I want to experiment with noisy GRU states instead of resetting them to zero for each batch; my attempt at an implementation is below. My original code reset the initial states to zero with states = None. I changed train_step to use
noisy_states = tf.convert_to_tensor(np.random.random([BATCH_SIZE, RNN_UNITS]).astype(np.float32))
predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)
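As a side note, I am not sure the NumPy call is ideal here: since train_step below is wrapped in @tf.function, my understanding (an assumption, not something I have verified) is that np.random.random is evaluated only once at trace time, so the same noise would be reused for every batch. A TensorFlow-native variant would sample fresh noise on every step:

# Assumed alternative: sample the noise with a TensorFlow op so that a new
# state is drawn on each executed step, not only at tracing time.
noisy_states = tf.random.uniform([BATCH_SIZE, RNN_UNITS], dtype=tf.float32)
predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)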
The full model class, which inherits from tf.keras.Model, now looks like this
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, rnn_units):
        super().__init__(self)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(rnn_units,
                                       stateful=True,
                                       return_sequences=True,
                                       return_state=True,
                                       activation='tanh',
                                       recurrent_activation='sigmoid',
                                       recurrent_dropout=0.2,
                                       dropout=0.2,
                                       reset_after=True)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        x = inputs
        x = self.embedding(x, training=training)
        if states is None:
            # no explicit state passed: fall back to the layer's default (zero) state
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)
        if return_state:
            return x, states
        else:
            return x

    @tf.function
    def train_step(self, inputs):
        inputs, labels = inputs
        with tf.GradientTape() as tape:
            # seed the GRU with random noise instead of the default zero state
            noisy_states = tf.convert_to_tensor(np.random.random([BATCH_SIZE, RNN_UNITS]).astype(np.float32))
            predictions, states = self(inputs, states=noisy_states, return_state=True, training=True)
            loss = self.compiled_loss(labels, predictions, regularization_losses=self.losses)
        grads = tape.gradient(loss, model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, model.trainable_variables))
        self.compiled_metrics.update_state(labels, predictions)
        return {m.name: m.result() for m in self.metrics}
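For context, this is roughly how I drive training; the constants and the dummy integer dataset below are placeholders standing in for my real text pipeline, just to show how the custom train_step is exercised:

import numpy as np
import tensorflow as tf

# Placeholder hyperparameters (my real script derives these from the corpus).
VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS, BATCH_SIZE, SEQ_LEN = 70, 256, 1024, 64, 100

model = MyModel(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS)
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

# Dummy (input, label) pairs of token ids; drop_remainder keeps the batch size
# fixed at 64, which the stateful GRU requires.
ids = np.random.randint(0, VOCAB_SIZE, size=(BATCH_SIZE * 4, SEQ_LEN + 1))
dataset = (tf.data.Dataset.from_tensor_slices((ids[:, :-1], ids[:, 1:]))
           .batch(BATCH_SIZE, drop_remainder=True))

model.fit(dataset, epochs=1)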
Training runs with no error, but inference fails with
ValueError: in user code:
train.py:239 generate_one_step *
predicted_logits, states = self.model(inputs=input_ids, states=states,
train-v4.py:133 call *
x, states = self.gru(x, initial_state=states, training=training)
/usr/local/lib/python3.6/dist-packages/keras/layers/recurrent.py:716 __call__ **
return super(RNN, self).__call__(inputs, **kwargs)
[...]
ValueError: Input 0 is incompatible with layer gru: expected shape=(64, None, 256), found shape=(1, None, 256)
The generator looks like this
class OneStep(tf.keras.Model):
    def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
        super().__init__()
        self.temperature = temperature
        self.model = model
        [initialize stuff]
        [...]

    @tf.function
    def generate_one_step(self, inputs, states=None):
        input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
        input_ids = self.ids_from_chars(input_chars).to_tensor()
        predicted_logits, states = self.model(inputs=input_ids, states=states,
                                              return_state=True)
        predicted_logits = predicted_logits[:, -1, :]
        predicted_logits = predicted_logits / self.temperature
        predicted_logits = predicted_logits + self.prediction_mask
        predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
        predicted_ids = tf.squeeze(predicted_ids, axis=-1)
        predicted_chars = self.chars_from_ids(predicted_ids)
        return predicted_chars, states
and the code throwing the error is
for n in range(10000):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)
In my understanding, we initialize the states with some noise instead of zeros to reduce overfitting. The model trains better than before, and the weights are saved for inference. Should the inference model be changed as well? Does the handling of states need to be updated in the generator too?
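Concretely, is something along these lines needed on the inference side? This is only my guess (the constants and the seed string are placeholders from my script): a fresh instance of the same model built at batch size 1, so the stateful GRU is not pinned to the training batch of 64, with the trained weights copied over and a batch-of-one noisy state fed to the generator.

# Assumed fix, not verified: rebuild the model for generation so the stateful
# GRU is created with batch size 1 instead of the training batch size.
inference_model = MyModel(VOCAB_SIZE, EMBEDDING_DIM, RNN_UNITS)
inference_model(tf.zeros([1, 1], dtype=tf.int32))   # build the variables with a batch of one
for src, dst in zip(model.trainable_variables, inference_model.trainable_variables):
    dst.assign(src)                                  # copy the trained weights across

one_step_model = OneStep(inference_model, chars_from_ids, ids_from_chars)

# Seed generation with the same kind of noise used in training, but with batch size 1.
states = tf.random.uniform([1, RNN_UNITS])
next_char = tf.constant(['R'])                       # placeholder seed character
result = [next_char]
for n in range(300):
    next_char, states = one_step_model.generate_one_step(next_char, states=states)
    result.append(next_char)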