IMDB Reviews

An example of using this package to make a model (with the tokenization layer) and train it on the IMDB Reviews Dataset.

!pip install tokenization-layer
import tokenization_layer

import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import re
import string
import os

from IPython.display import clear_output  # used in the training loop to refresh the progress display

Getting & Preparing the Data

First, let's get the data and prepare it. I have the dataset on my Google Drive, so we can download it from there:

import requests
from io import StringIO
orig_url = 'https://drive.google.com/file/d/1-4wZ3VawRfxvX9taPhfHWU7mAiH-gBDe/view?usp=sharing'
file_id = orig_url.split('/')[-2]
dwn_url='https://drive.google.com/uc?export=download&id=' + file_id
url = requests.get(dwn_url).text
csv_raw = StringIO(url)
data = pd.read_csv(csv_raw)

Then, some basic preprocessing:

data = data.sample(len(data)).reset_index(drop=True)

# Strip "<br />" tags and convert to lowercase
data["review"] = data["review"].apply(lambda x: x.replace("<br />", " ").lower())
# Strip punctuation
data["review"] = data["review"].apply(lambda x: re.sub(f"[{string.punctuation}]", "", x))
# Get top 30 most common characters
chars = "".join(pd.Series(list(" ".join(data["review"].to_list()))).value_counts().keys()[:30])
# Remove everything except the top 30 most common characters
data["review"] = data["review"].apply(lambda x: re.sub(f"[^{chars}]", "", x))

from sklearn.preprocessing import OrdinalEncoder
data[["sentiment"]] = OrdinalEncoder().fit_transform(data[["sentiment"]])

print(data.head())

Do a train-test-validation split:

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(data["review"], data["sentiment"], test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

And finally, turn it into a TensorFlow dataset for the last preprocessing steps: splitting the text into characters, one-hot encoding it, and clipping/padding every review to the same length (plus batching into batches of 32):

one_hot = lambda x: tf.cast(tokenization_layer.one_hot_str(x, chars), tf.float32)
# Clip and pad all reviews to be 2000 characters long
def clip_and_pad(x):
    output_length = 2000
    shape = tf.shape(x)
    if shape[1] >= output_length:
        return x[:, :output_length]
    else:
        return tf.concat([x, tf.zeros((shape[0], output_length-shape[1]))], axis=1)

# Convert to TF Datasets and preprocess
X_train = tf.data.Dataset.from_tensor_slices(X_train).map(one_hot).map(clip_and_pad)
X_val = tf.data.Dataset.from_tensor_slices(X_val).map(one_hot).map(clip_and_pad)
X_test = tf.data.Dataset.from_tensor_slices(X_test).map(one_hot).map(clip_and_pad)

y_train = tf.data.Dataset.from_tensor_slices(np.asarray(y_train).astype('float32'))
y_val = tf.data.Dataset.from_tensor_slices(np.asarray(y_val).astype('float32'))
y_test = tf.data.Dataset.from_tensor_slices(np.asarray(y_test).astype('float32'))

# Merge the Xs and ys into one TF Dataset each
train_set = tf.data.Dataset.zip((X_train, y_train))
val_set = tf.data.Dataset.zip((X_val, y_val))
test_set = tf.data.Dataset.zip((X_test, y_test))
for item in train_set.take(3):
    print(item)

# Shuffle, batch and prefetch data
train_set = train_set.shuffle(buffer_size=1000, seed=42, reshuffle_each_iteration=False) \
                     .batch(32, drop_remainder=True).prefetch(1)
val_set = val_set.shuffle(buffer_size=1000, seed=42, reshuffle_each_iteration=False) \
                     .batch(32, drop_remainder=True).prefetch(1)
test_set = test_set.shuffle(buffer_size=1000, seed=42, reshuffle_each_iteration=False) \
                     .batch(32, drop_remainder=True).prefetch(1)

# Add an extra dimension so that the shape is `(batch_size, num_chars, text_len, 1)` (i.e. what the tokenization layer expects):
train_set = train_set.map(lambda x, y: (tf.expand_dims(x, 3), y))
val_set = val_set.map(lambda x, y: (tf.expand_dims(x, 3), y))
test_set = test_set.map(lambda x, y: (tf.expand_dims(x, 3), y))
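
As an optional sanity check, we can pull one batch and confirm its shape matches the dummy input the model is built with further down, i.e. (32, 31, 2000, 1) (the 31 one-hot rows per review is taken from that dummy input, not derived here):

# Optional: check that one batch has the expected shape before building the model
for x_batch, y_batch in train_set.take(1):
    print("x:", x_batch.shape)  # expected: (32, 31, 2000, 1)
    print("y:", y_batch.shape)  # expected: (32,)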

Defining the Model

Now, let's start building the model.

To start, we need an initialization method for the patterns (tokens) of the tokenization layer. Here we'll use tokenization_layer.PatternsInitilizerMaxCover.

corpus = " ".join(data["review"].to_list())[:10000000] # If the corpus is too large, we'll run into RAM issues
patterns_init = tokenization_layer.PatternsInitilizerMaxCover(corpus, chars)

Then, we'll define our model (we'll use the subclassing API, but the other Keras APIs also work):

class ModelTokenization(tf.keras.Model):
    def __init__(self):
        super(ModelTokenization, self).__init__(name='')
        
        self.tokenization = tokenization_layer.TokenizationLayer(n_neurons=500, initializer=patterns_init, 
                                                                 pattern_lens=max(patterns_init.gram_lens))
        # Process the output of the tokenization layer so that it's digestible to the Embedding layer
        self.lambda1 = keras.layers.Lambda(lambda x: tf.transpose(tf.squeeze(x, 3), [0, 2, 1]))
        # We only need an embedding length of 1 because the rest of the network is just fully connected...
        self.embedding = tokenization_layer.EmbeddingLayer(embedding_length=1)
        # Flatten the embedded text so that the dense layers can process it
        self.flatten = keras.layers.Flatten()

        self.batch_norm1 = keras.layers.BatchNormalization()
        self.dense = keras.layers.Dense(64)
        self.out = keras.layers.Dense(1, activation="sigmoid")

    def call(self, input_tensor, return_intermediates=False, training=False):
        tokenization_out = self.tokenization(input_tensor, training=training)
        lambda1_out = self.lambda1(tokenization_out, training=training)
        embedding_out = self.embedding(lambda1_out, training=training)
        flatten_out = self.flatten(embedding_out)

        batch_norm1_out = self.batch_norm1(flatten_out, training=training)
        dense_out = self.dense(batch_norm1_out, training=training)
        out = self.out(dense_out, training=training)

        if return_intermediates:
            return out, dense_out, flatten_out, embedding_out, lambda1_out, tokenization_out
        else:
            return out
model = ModelTokenization()
_ = model(tf.zeros([32, 31, 2000, 1]))
model.summary()

Making the Training Loop

For the final part of this example, we'll write a custom training loop for our model. Note that you don't have to do this; model.compile() and model.fit() also work, as sketched below.
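
For reference, the standard Keras route would look something like this (a minimal sketch; the metric choice and number of epochs here are arbitrary and not part of the example above):

# Standard compile/fit alternative to the custom loop below
model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.BinaryCrossentropy(),
              metrics=["accuracy"])
# model.fit(train_set, validation_data=val_set, epochs=10)

For the custom loop, we first set up the optimizer, loss function and metrics: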

optimizer = keras.optimizers.Adam()
loss_fn = keras.losses.BinaryCrossentropy()
train_acc_metric = keras.metrics.Accuracy()
val_acc_metric = keras.metrics.Accuracy()

Our training loop will save model checkpoints, as well as logs of how the patterns and gradients evolve. We set up the checkpoint directory and log files here:

path = "model_checkpoints/"
os.makedirs(path, exist_ok=True)  # make sure the checkpoint/log directory exists
with open(path+"patterns_log.txt", "a+") as f:
    pass
with open(path+"grads_log.csv", "a+") as f:
    f.write("Out Mean,Out Std,Dense Mean,Dense Std,Embedding Mean,Embedding Std,Tokenization Mean,"+\
            "Tokenization Std,Out Kernel Mean,Out Kernel Std,Out Bias Mean,Out Bias Std,"+\
            "Dense Kernel Mean,Dense Kernel Std,Dense Bias Mean,Dense Bias Std,"+\
            "Embedding Kernel Mean,Embedding Kernel Std,Patterns Mean,Patterns Std,\n")
with open(path+"vals_log.csv", "a+") as f:
    f.write("Out Mean,Out Std,Dense Mean,Dense Std,Embedding Mean,Embedding Std,Tokenization Mean,"+\
            "Tokenization Std,Out Kernel Mean,Out Kernel Std,Out Bias Mean,Out Bias Std,"+\
            "Dense Kernel Mean,Dense Kernel Std,Dense Bias Mean,Dense Bias Std,"+\
            "Embedding Kernel Mean,Embedding Kernel Std,Patterns Mean,Patterns Std,\n")

Lastly, here's the actual training loop itself:

class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

# Note that these functions won't work if your `chars` includes an `"<UNK>"` token.
char_lookup = tf.concat([tf.constant(["█"]), tf.strings.bytes_split(tf.constant(chars))], axis=0)
reverse_text = lambda x: tf.strings.join(tf.gather(char_lookup, tf.argmax(tf.concat([tf.fill([1, x.shape[1]], 0.5), x], axis=0), axis=0)))
epochs = 10
for epoch in range(epochs):
    train_loss_rounded, train_acc_rounded = 0, 0
    for step, (x_batch_train, y_batch_train) in enumerate(train_set):
        # -=-= COMPUTE GRADIENTS OF BATCH  =-=-
        with tf.GradientTape() as tape:
            z, dense_out, flatten_out, embedding_out, lambda1_out, tokenization_out = model(x_batch_train, return_intermediates=True, training=True)
            z = tf.squeeze(z, 1)
            loss = loss_fn(y_batch_train, z)
        layer_vals = [z, dense_out, embedding_out, tokenization_out]
        grads = tape.gradient(loss, layer_vals+model.trainable_variables)
        layer_grads = grads[:len(layer_vals)]
        grads = grads[len(layer_vals):]
        
        # -=-= LOG INFO  =-=-
        progress_bar_done = "".join(["█" for _ in range(round( step*20/len(train_set) ))])
        progress_bar_left = "".join([" " for _ in range(20-round( step*20/len(train_set) ))])
        percent_done = round(step*100/len(train_set), 2)

        save_patterns = False
        if step%10 == 0:
            save_patterns = True
            # Decode patterns
            patterns = model.tokenization.patterns
            patterns = tf.cast(tf.math.logical_and(
                patterns == tf.expand_dims(tf.reduce_max(patterns, axis=0), 0),
                tf.reduce_sum(patterns, axis=0) > 0
            ), tf.float32)
            patterns_decoded = [reverse_text(pattern).numpy().decode() for pattern in tf.transpose(tf.squeeze(patterns, 2), [2, 0, 1])]
            # Get patterns to log
            pattern_grads = tf.transpose(tf.squeeze(grads[0], 2), [2, 0, 1])
            pattern_grads_summary = tf.math.reduce_std(pattern_grads, axis=[1, 2])+tf.abs(tf.reduce_mean(pattern_grads, axis=[1, 2]))
            pattern_grads_sorted_indexes = list(pd.Series(pattern_grads_summary).sort_values().keys())

        clear_output(wait=True)
        print(f'Epoch {epoch+1}/{epochs} - |{progress_bar_done}{progress_bar_left}| - {percent_done}% - {step+1}/{len(train_set)}')
        print(f'Train loss: {train_loss_rounded} - Train accuracy: {train_acc_rounded}')
        print()
        # Log patterns
        top_n = 15
        buffer = "".join("0" for _ in range(7))

        patterns_log_high = [f'"{patterns_decoded[i]}": '+(str(pattern_grads_summary[i].numpy()*100)+buffer)[:7]+" | " 
                             for i in pattern_grads_sorted_indexes[-top_n:]]
        num_per_row = int(np.floor(135/len(patterns_log_high[0])))
        print(f"{color.BOLD}Patterns with diverse non-zero gradients{color.END}")
        for i in range(int(np.floor(len(patterns_log_high)/num_per_row))):
            print("".join(patterns_log_high[(i)*num_per_row:(i+1)*num_per_row]))
        if len(patterns_log_high)%num_per_row != 0:
            # Print the remaining patterns that didn't fill a complete row
            print("".join(patterns_log_high[int(np.floor(len(patterns_log_high)/num_per_row))*num_per_row:]))

        patterns_log_low = [f'"{patterns_decoded[i]}": '+(str(pattern_grads_summary[i].numpy()*100)+buffer)[:7]+" | " 
                             for i in pattern_grads_sorted_indexes[:top_n]]
        num_per_row = int(np.floor(135/len(patterns_log_low[0])))
        print(f"{color.BOLD}Patterns with mostly zero gradients{color.END}")
        for i in range(int(np.floor(len(patterns_log_low)/num_per_row))):
            print("".join(patterns_log_low[(i)*num_per_row:(i+1)*num_per_row]))
        if len(patterns_log_low)%num_per_row != 0:
            # Print the remaining patterns that didn't fill a complete row
            print("".join(patterns_log_low[int(np.floor(len(patterns_log_low)/num_per_row))*num_per_row:]))

        # -=-= UPDATE NETWORK & METRICS  =-=-
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_acc_metric.update_state(y_batch_train, tf.round(z))
        train_loss_rounded, train_acc_rounded = "%.4f" % loss.numpy(), "%.4f" % train_acc_metric.result().numpy()

        # -=-= SAVE THINGS  =-=-
        if (step%int(np.floor(len(train_set)/5))==0) and (step != 0):
            cp_num = len(os.listdir(path))-1
            model_cp = tf.train.Checkpoint(model=model)
            model_cp.write(path+f"model_cp_{cp_num}/model_checkpoint")
        if save_patterns:
            with open(path+"patterns_log.txt", "a+") as f:
                [f.write(f'"{pattern}", ') for pattern in patterns_decoded]
                f.write("\n")
        with open(path+"grads_log.csv", "a+") as f:
            for layer_index in [0, 1, 2, 3]:
                f.write(str( tf.reduce_mean(layer_grads[layer_index]).numpy() )+",")
                f.write(str( tf.math.reduce_std(tf.reduce_mean(layer_grads[layer_index], axis=0)).numpy() )+",")
            for param_index in [6, 7, 4, 5, 1, 0]:
                f.write(str( tf.reduce_mean(grads[param_index]).numpy() )+",")
                f.write(str( tf.math.reduce_std(grads[param_index]).numpy() )+",")
            f.write("\n")
        with open(path+"vals_log.csv", "a+") as f:
            for layer_index in [0, 1, 2, 3]:
                f.write(str( tf.reduce_mean(layer_vals[layer_index]).numpy() )+",")
                f.write(str( tf.math.reduce_std(tf.reduce_mean(layer_vals[layer_index], axis=0)).numpy() )+",")
            for param_index in [6, 7, 4, 5, 1, 0]:
                f.write(str( tf.reduce_mean(model.trainable_variables[param_index]).numpy() )+",")
                f.write(str( tf.math.reduce_std(model.trainable_variables[param_index]).numpy() )+",")
            f.write("\n")

    train_acc_metric.reset_states()

There it is! Every 10 steps, this training loop decodes all the patterns (i.e. tokens) in the tokenization layer, saves them to patterns_log.txt, and displays the patterns with the most diverse non-zero gradients alongside those with mostly zero gradients (an indicator of convergence). It also saves the mean and standard deviation of the values and gradients throughout the whole model, in vals_log.csv and grads_log.csv respectively.
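
After (or during) training, the logs can be read back with pandas to check how training went. A minimal sketch, assuming the files were written by the loop above:

# Inspect the gradient/value statistics logged during training
grads_log = pd.read_csv(path+"grads_log.csv")
vals_log = pd.read_csv(path+"vals_log.csv")
print(grads_log[["Patterns Mean", "Patterns Std"]].tail())

# Each line of patterns_log.txt is one snapshot of the decoded patterns
with open(path+"patterns_log.txt") as f:
    snapshots = [line for line in f if line.strip()]
print(f"{len(snapshots)} pattern snapshots saved")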
