Learning Attentional LSTM Models

A brief note on the architecture: the embedding layer maps each token index to a dense vector, and the encoder layer takes its input from the embedding layer, producing one hidden state per timestep.
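
As a minimal sketch of this wiring (the vocabulary size, sequence length, and embedding size below are illustrative placeholders, not the values used later):

from keras.layers import Input, Embedding, LSTM
from keras.models import Model

# token ids go in, the embedding layer turns each id into a dense vector,
# and the recurrent encoder consumes those vectors one timestep at a time
inp = Input(shape=(30,))                                  # sequence of 30 token ids
emb = Embedding(input_dim=10000, output_dim=50)(inp)      # -> (batch, 30, 50)
enc = LSTM(50, return_sequences=True)(emb)                # one hidden state per timestep
Model(inputs=inp, outputs=enc).summary()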

Reading the Dataset

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
root_dir = "../input"
batch_size = 64
vocab_size = 30
maxlen = 30
nrows = 450000 # limit the number of training rows (due to Kaggle resource limits)
library = pd.read_csv(os.path.join(root_dir, 'library-collection-inventory.csv'),
                      usecols=['Title', 'Author', 'Publisher', 'Subjects'])
library.dropna(inplace=True)
library = library.iloc[np.random.randint(0, library.shape[0], size=nrows)]  # random sample (with replacement)
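
To sanity-check the random sample before training, it helps to look at its shape and a few rows (the exact rows shown depend on the draw):

print(library.shape)                          # (nrows, 4): Title, Author, Publisher, Subjects
print(library[['Title', 'Subjects']].head())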

Now to the important part of this tutorial: the model definition.

Attention Layer Implementation

Why Attention?

The attention layer creates a context vector from the outputs of all encoded words, weighting each timestep by its learned relevance, so the decoder can draw on the whole input sequence rather than only the encoder's final state [1].
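
In plain NumPy terms, the weighted average that the layer below computes looks roughly like this (h stands for a hypothetical matrix of encoder outputs and w for the layer's single weight vector; the shapes are illustrative):

import numpy as np

h = np.random.rand(30, 50)                     # 30 timesteps, 50 channels (illustrative)
w = np.random.rand(50, 1)                      # one parameter per channel

logits = (h @ w).ravel()                       # one relevance score per timestep
weights = np.exp(logits - logits.max())        # 'max trick' for numerical stability
weights /= weights.sum()                       # softmax over timesteps
context = (h * weights[:, None]).sum(axis=0)   # attention-weighted average -> context vector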

from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K


class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses one parameter per channel to compute the attention value for a single timestep.
    """

    def __init__(self, return_attention=False,**kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        self.trainable_weights = [self.W]
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None
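
With return_attention=True the layer also returns the per-timestep weights, which makes it possible to inspect which words the encoder focuses on. A small sketch under assumed shapes (the probe model below is purely illustrative):

from keras.layers import Input, Embedding, LSTM
from keras.models import Model

inp = Input(shape=(30,))
emb = Embedding(1000, 50)(inp)
seq = LSTM(50, return_sequences=True)(emb)
pooled, att = AttentionWeightedAverage(return_attention=True)(seq)
probe = Model(inputs=inp, outputs=[pooled, att])
# probe.predict(batch_of_sequences) would yield the context vectors and the attention weights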

The Model Class

from keras.layers import Embedding, GlobalMaxPool1D, Dense, Input, Dropout, SpatialDropout1D, RepeatVector
from keras.models import Model
from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.base import BaseEstimator, ClassifierMixin

# sklearn-style wrapper, so the Keras model can be used with grid search and other sklearn utilities
class NeuralClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,attention=False,type='lstm',embed_size=50,vocab_size=10000,nepochs=5,patience=4,batch_size=16,maxlen=100):
        self.attention = attention
        self.type = type
        self.vocab_size = vocab_size
        self.nepochs = nepochs
        self.patience = patience
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.embed_size = embed_size
        self.file_path = 'autoencoder_'+self.type+'.best.hdf5'
        self.val_size = 0.3
        self.tokenizer = None
        self.autoencoder = None
        self.history = None
        
    def get_params(self, deep=True):
        return {"attention":self.attention,"type":self.type,"vocab_size":self.vocab_size,"nepochs":self.nepochs,\
                "patience":self.patience,"batch_size":self.batch_size,"maxlen":self.maxlen,"embed_size":self.embed_size}
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self 
    
    def init_model(self,out_dim=20):
        if 'lstm' in self.type:
            return self.lstm_model(out_dim)
        elif 'gru' in self.type:
            return self.gru_model(out_dim) 
            
    def lstm_model(self, out_dim):
        inp = Input(shape=(self.maxlen,))
        embedding = Embedding(self.vocab_size, self.embed_size)(inp)
        encoder = LSTM(self.embed_size, return_sequences=True, dropout=0.1, recurrent_dropout=0.1, name='LSTM_encoder')(embedding)
        if self.attention:
            encoder = AttentionWeightedAverage()(encoder)
        else:
            encoder = GlobalMaxPool1D()(encoder)
        encoder = Dense(self.embed_size, activation="relu")(encoder)

        decoded = RepeatVector(self.maxlen)(encoder)
        decoder = LSTM(self.embed_size, return_sequences=False, dropout=0.1, recurrent_dropout=0.1, name='LSTM_decoder')(decoded)
        decoder = Dense(out_dim, activation="softmax", name="decoder_out")(decoder)
        autoencoder = Model(inputs=inp, outputs=decoder)
        autoencoder.compile(optimizer='rmsprop', loss='categorical_crossentropy')
        return autoencoder
    
    def gru_model(self, out_dim):
        inp = Input(shape=(self.maxlen,))
        embedding = Embedding(self.vocab_size, self.embed_size)(inp)
        embedding = SpatialDropout1D(0.1)(embedding)
        encoder = GRU(self.embed_size, return_sequences=True, name='GRU_encoder')(embedding)
        if self.attention:
            encoder = AttentionWeightedAverage()(encoder)
        else:
            encoder = GlobalMaxPool1D()(encoder)
        encoder = Dense(self.embed_size, activation="relu")(encoder)

        decoded = RepeatVector(self.maxlen)(encoder)
        decoder = GRU(self.embed_size, return_sequences=False, dropout=0.1, recurrent_dropout=0.1, name='GRU_decoder')(decoded)
        decoder = Dense(out_dim, activation="softmax", name="decoder_out")(decoder)
        autoencoder = Model(inputs=inp, outputs=decoder)
        autoencoder.compile(optimizer='rmsprop', loss='categorical_crossentropy')
        return autoencoder
    
    def tokenize_set(self,train_sentences,test_sentences):
        self.tokenizer = Tokenizer(num_words=self.vocab_size)
        self.tokenizer.fit_on_texts(list(train_sentences))
        list_tokenized_train = self.tokenizer.texts_to_sequences(train_sentences)
        list_tokenized_test = self.tokenizer.texts_to_sequences(test_sentences)
        X_t = pad_sequences(list_tokenized_train, maxlen=self.maxlen)
        X_te = pad_sequences(list_tokenized_test, maxlen=self.maxlen)
        return X_t,X_te
    
    def fit(self, X,sentences):  
        checkpoint = ModelCheckpoint(self.file_path, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')
        early = EarlyStopping(monitor="val_loss", mode="min", patience=self.patience)
        callbacks_list = [early, checkpoint]
        y_train = self.tokenizer.texts_to_matrix(sentences, mode='binary')
        self.autoencoder = self.init_model(out_dim=y_train.shape[1])
        self.history = self.autoencoder.fit(X, y_train, batch_size=self.batch_size, epochs=self.nepochs, validation_split=self.val_size, callbacks=callbacks_list, shuffle=True)
        return self
    
    def predict(self,X):
        self.autoencoder.load_weights(self.file_path)
        return self.autoencoder.predict([X],batch_size=1024,verbose=1)
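
Because the wrapper exposes its constructor arguments through get_params/set_params, sklearn utilities that rely on that protocol can handle it; for example, sklearn.base.clone can copy an unfitted configuration (a small illustrative check, not part of the training pipeline):

from sklearn.base import clone

template = NeuralClassifier(attention=True, type='gru_attention', vocab_size=vocab_size, maxlen=maxlen)
print(clone(template).get_params())   # rebuilds a fresh, unfitted estimator from its parameters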

Preparing the Training Data

list_titles = library.apply(lambda x: x['Title'] + ' ' + x['Subjects'], axis=1).fillna("_na_").values


Training the Attentional GRU Autoencoder Model

model_with_attention = NeuralClassifier(attention=True, batch_size=batch_size, type='gru_attention', vocab_size=vocab_size, maxlen=maxlen, nepochs=20)
train_features, _ = model_with_attention.tokenize_set(list_titles, [])
model_with_attention.fit(train_features, list_titles)
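
After fit() returns, the underlying Keras model is available on the autoencoder attribute, so the resulting layer stack can be inspected:

model_with_attention.autoencoder.summary()   # embedding -> GRU encoder -> attention -> decoder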

Training the GRU Model Without Attention

model_without_attention = NeuralClassifier(attention=False, batch_size=batch_size, type='gru', vocab_size=vocab_size, maxlen=maxlen, nepochs=20)
model_without_attention.tokenizer = model_with_attention.tokenizer  # reuse the tokenizer fitted above
model_without_attention.fit(train_features, list_titles)

Plotting the Validation Loss of the Autoencoders

def show_loss(history,name):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(name)
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
show_loss(model_with_attention.history,'GRU Model With Attention')
show_loss(model_without_attention.history,'GRU Model')

Clustering Books Based on Their Titles and Subjects Using KMeans

from string import punctuation
from sklearn.cluster import KMeans
from wordcloud import WordCloud
exclude = set(punctuation)
cleaner = lambda title: ''.join(ch for ch in title if ch not in exclude)  # helper to strip punctuation from a title
kmeans = KMeans()  # sklearn default of 8 clusters
feats = model_with_attention.predict(train_features)
y_kmeans = kmeans.fit_transform(feats)  # distances of each book to every cluster centre
cluster_indx = 'Cluster_Index'
library[cluster_indx] = kmeans.labels_
library['distance'] = y_kmeans.min(axis=1)
library.sort_values(by='distance', inplace=True, axis=0)
labels = library[cluster_indx].unique()
wc = WordCloud(background_color="white", max_words=7, width=600, height=300)
for label in labels:
    word_cloud = wc.generate_from_frequencies({book_name: 1 for book_name in library[library[cluster_indx] == label]['Title'].apply(lambda x: x[:15]).values.tolist()})
    plt.figure(figsize=(20, 10))
    plt.imshow(word_cloud, interpolation="bilinear")
    plt.title('Label name ' + str(label) + ' having ' + str(library[library[cluster_indx] == label].shape[0]) + ' books')
    plt.show()
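
Before aggregating by author and publisher, a quick pandas count shows how evenly KMeans spread the books across the clusters:

print(library[cluster_indx].value_counts())   # number of books per cluster label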

Some Statistics on the resulting clusters

import seaborn as sns

def plot_n_cluster_per(aggr_col):
    sns.set(font_scale=1)
    f, ax = plt.subplots(figsize=(16, 32))
    max_n_observations = 10
    n_groups = library.groupby(aggr_col)[cluster_indx].nunique()  # how many distinct clusters each group spans
    n_groups = n_groups.nlargest(max_n_observations)
    colors_cw = sns.color_palette('cubehelix_r', len(n_groups))
    sns.barplot(n_groups.values, n_groups.keys(), palette=colors_cw[::-1])
    ax.set(title='Type of Books published by the same ' + aggr_col)

plot_n_cluster_per('Author')
plot_n_cluster_per('Publisher')
library.to_csv('output_clusters.csv',index=False)

References

  1. Bahdanau, D., Cho, K., & Bengio, Y. (2015). Neural Machine Translation by Jointly Learning to Align and Translate. In Y. Bengio & Y. LeCun (Eds.), 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings. http://arxiv.org/abs/1409.0473