And finally, we turn it into a TensorFlow dataset and apply the final preprocessing: splitting the text into characters, one-hot encoding it, and padding it so that every review is the same length (plus batching into batches of 32):
Defining the Model
Now, let's start making the model.
To start, we need an initialization method for the patterns (tokens) of the tokenization layer. Here we'll use tokenization_layer.PatternsInitializerMaxCover.
And then we'll define our model (we'll use the subclassing API, but the other Keras APIs work too):
Making the Training Loop
For the final part of this example, we'll write a custom training loop for our model. Note that you don't have to do this; model.compile() + model.fit() also works.
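For instance, the standard Keras route would look something like this (the loss, optimizer and epoch count are just illustrative choices, not settings from the experiment):

# Standard compile/fit alternative to the custom loop below
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])
model.fit(train_set, validation_data=val_set, epochs=5)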
Our training loop will save model checkpoints, as well as information on how the patterns and gradients evolve. We initialize those things here:
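Roughly, that setup could look like the sketch below; the optimizer, checkpoint directory and CSV headers are illustrative assumptions, while the log file names match the description further down:

# Optimizer and loss for the custom loop (choices here are illustrative)
optimizer = keras.optimizers.Nadam(learning_rate=1e-3)
loss_fn = keras.losses.BinaryCrossentropy()

# Checkpointing: track the model and optimizer state so we can save them periodically
checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
checkpoint_manager = tf.train.CheckpointManager(checkpoint, directory="./checkpoints", max_to_keep=5)

# Log files for how the patterns, gradients and values evolve during training
patterns_log = open("patterns_log.txt", "w")
grads_log = open("grads_log.csv", "w")
vals_log = open("vals_log.csv", "w")
grads_log.write("step,variable,mean,std\n")
vals_log.write("step,variable,mean,std\n")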
Lastly, here's the actual training loop itself:
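The full loop is a bit long, so the condensed sketch below just shows its overall shape, reusing the optimizer, loss and log handles initialized above. The epoch count, the exact logging cadence and the decode_patterns() helper (which would turn the tokenization layer's learned patterns back into readable strings) are stand-ins rather than the exact original code:

n_epochs = 5
step = 0
for epoch in range(n_epochs):
    for X_batch, y_batch in train_set:
        # Forward and backward pass
        with tf.GradientTape() as tape:
            y_pred = model(X_batch, training=True)
            loss = loss_fn(y_batch, y_pred)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if step % 10 == 0:
            # Decode the current patterns (tokens) and append them to the log
            patterns = decode_patterns(model.tokenization, chars)  # hypothetical helper
            patterns_log.write(f"step {step}: {patterns}\n")
            # (the full loop also reports the patterns with the most zero / non-zero gradients)

            # Log the mean and standard deviation of gradients and values for every variable
            for var, grad in zip(model.trainable_variables, grads):
                if grad is None:
                    continue
                grads_log.write(f"{step},{var.name},{float(tf.reduce_mean(grad))},{float(tf.math.reduce_std(grad))}\n")
                vals_log.write(f"{step},{var.name},{float(tf.reduce_mean(var))},{float(tf.math.reduce_std(var))}\n")

            # Save a model checkpoint
            checkpoint_manager.save()
        step += 1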
There it is! Every 10 steps, this training loop will decode all the patterns (i.e. tokens) in the tokenization layer, save them to patterns_log.txt, and display the top patterns with the most non-zero and zero gradients (which is an indicator of convergence). It also saves the mean and standard deviation of the gradients and values across the whole model, in grads_log.csv and vals_log.csv respectively.
one_hot = lambda x: tf.cast(tokenization_layer.one_hot_str(x, chars), tf.float32)
# Clip and pad all reviews to be 2000 characters long
def clip_and_pad(x):
    output_length = 2000
    shape = tf.shape(x)
    if shape[1] >= output_length:
        return x[:, :output_length]
    else:
        return tf.concat([x, tf.zeros((shape[0], output_length - shape[1]))], axis=1)
# Convert to TF Datasets and preprocess
X_train = tf.data.Dataset.from_tensor_slices(X_train).map(one_hot).map(clip_and_pad)
X_val = tf.data.Dataset.from_tensor_slices(X_val).map(one_hot).map(clip_and_pad)
X_test = tf.data.Dataset.from_tensor_slices(X_test).map(one_hot).map(clip_and_pad)
y_train = tf.data.Dataset.from_tensor_slices(np.asarray(y_train).astype('float32'))
y_val = tf.data.Dataset.from_tensor_slices(np.asarray(y_val).astype('float32'))
y_test = tf.data.Dataset.from_tensor_slices(np.asarray(y_test).astype('float32'))
# Merge the Xs and ys into single TF Datasets
train_set = tf.data.Dataset.zip((X_train, y_train))
val_set = tf.data.Dataset.zip((X_val, y_val))
test_set = tf.data.Dataset.zip((X_test, y_test))
# Take a quick look at a few examples
for item in train_set.take(3):
    print(item)
# Shuffle, batch and prefetch data
train_set = train_set.shuffle(buffer_size=1000, seed=42, reshuffle_each_iteration=False) \
.batch(32, drop_remainder=True).prefetch(1)
val_set = val_set.shuffle(buffer_size=1000, seed=42, reshuffle_each_iteration=False) \
.batch(32, drop_remainder=True).prefetch(1)
test_set = test_set.shuffle(buffer_size=1000, seed=42, reshuffle_each_iteration=False) \
.batch(32, drop_remainder=True).prefetch(1)
# Add an extra dimension so that the shape is `(batch_size, num_chars, text_len, 1)` (i.e. what the tokenization layer wants):
train_set = train_set.map(lambda x, y: (tf.expand_dims(x, 3), y))
val_set = val_set.map(lambda x, y: (tf.expand_dims(x, 3), y))
test_set = test_set.map(lambda x, y: (tf.expand_dims(x, 3), y))
corpus = " ".join(data["review"].to_list())[:10000000] # If the corpus is too large, we'll run into RAM issues
patterns_init = tokenization_layer.PatternsInitilizerMaxCover(corpus, chars)
class ModelTokenization(tf.keras.Model):
    def __init__(self):
        super(ModelTokenization, self).__init__(name='')
        self.tokenization = tokenization_layer.TokenizationLayer(n_neurons=500, initializer=patterns_init,
                                                                 pattern_lens=max(patterns_init.gram_lens))
        # Process the output of the tokenization layer so that it's digestible to the Embedding layer
        self.lambda1 = keras.layers.Lambda(lambda x: tf.transpose(tf.squeeze(x, 3), [0, 2, 1]))
        # We only need an embedding length of 1 because the rest of the network is just fully connected...
        self.embedding = tokenization_layer.EmbeddingLayer(embedding_length=1)
        # Flatten the embedded text so that the dense layers can process it
        self.flatten = keras.layers.Flatten()
        self.batch_norm1 = keras.layers.BatchNormalization()
        self.dense = keras.layers.Dense(64)
        self.out = keras.layers.Dense(1, activation="sigmoid")

    def call(self, input_tensor, return_intermediates=False, training=False):
        tokenization_out = self.tokenization(input_tensor, training=training)
        lambda1_out = self.lambda1(tokenization_out, training=training)
        embedding_out = self.embedding(lambda1_out, training=training)
        flatten_out = self.flatten(embedding_out)
        batch_norm1_out = self.batch_norm1(flatten_out, training=training)
        dense_out = self.dense(batch_norm1_out, training=training)
        out = self.out(dense_out, training=training)
        if return_intermediates:
            return out, dense_out, flatten_out, embedding_out, lambda1_out, tokenization_out
        else:
            return out
# Build the model by calling it on a dummy batch, then print a summary
model = ModelTokenization()
_ = model(tf.zeros([32, 31, 2000, 1]))
model.summary()
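As a quick sanity check (purely illustrative), you can also push one real batch through the freshly built model and confirm that it produces one sigmoid score per review:

# Run one real batch through the model to check the output shape
for X_batch, y_batch in train_set.take(1):
    preds = model(X_batch)
    print(preds.shape)  # -> (32, 1)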