LSTM for real-time recommendation systems

How to prepare your data for LSTM models.

Everything is a sequence

Clicks, purchases, and app launches all arrive in order, so a user's behavior history has the same shape as the word sequences that language models are built for.

Shopping list example

Think of a shopping list: the items a customer has already picked are a strong hint about what they will pick next.

User history as text

If we write each user's history as a string of item names separated by delimiters, next-item recommendation turns into next-word prediction, and the standard NLP tooling applies directly.

Practical example

text = "Amazon,Google+,Instagram|Instagram,Evernote,Amazon,Yahoo|Evernote,Instagram,Google+"
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np
# The default Tokenizer filters strip ',', '|' and '+' and lowercase the text,
# so "Google+" becomes the token "google"
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because index 0 is reserved for padding
vocabulary_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocabulary_size)

# Encode each user's history as a sequence of item indices
sequences = list()
for line in text.split('|'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    sequences.append(encoded)
print('Total Sequences: %d' % len(sequences))
print(sequences)
#Output =>
#Total Sequences: 3
#[[2, 3, 1], [1, 4, 2, 5], [4, 1, 3]]
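
To see which index each item received, inspect tokenizer.word_index; the Tokenizer assigns indices by frequency, so with this toy text the mapping should look roughly like this:

print(tokenizer.word_index)
# e.g. {'instagram': 1, 'amazon': 2, 'google': 3, 'evernote': 4, 'yahoo': 5}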
# Pad the shorter sequences on the left so every row has the same length;
# pad_sequences already returns a NumPy array
max_len = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')
print(sequences)
#Output =>
#[[0 2 3 1]
# [1 4 2 5]
# [0 4 1 3]]
print('Max Sequence Length: %d' % max_len)
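
padding='pre' puts the zeros in front, which keeps each user's most recent items right next to the position the model predicts from. For comparison, padding='post' would append them instead:

# For comparison only: post-padding appends the zeros
print(pad_sequences([[2, 3, 1]], maxlen=4, padding='post'))
#Output =>
#[[2 3 1 0]]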
# Inputs are every item but the last; the target is the last item
X, y = sequences[:, :-1], sequences[:, -1]
print(X)
print(y)
#Output =>
#[[0 2 3]
# [1 4 2]
# [0 4 1]] -- samples
#[1 5 3] -- targets
# One-hot encode the targets over the whole vocabulary (padding index included)
y = to_categorical(y, num_classes=vocabulary_size)
print(y)
#Output =>
#[[0. 1. 0. 0. 0. 0.]
# [0. 0. 0. 0. 0. 1.]
# [0. 0. 0. 1. 0. 0.]]
from keras import Sequential
from keras.layers import Embedding, Dropout, LSTM, Dense

model = Sequential()
# Learn a dense 5-dimensional vector for each item
model.add(Embedding(vocabulary_size, 5, input_length=max_len - 1))
model.add(Dropout(0.2))
# A small LSTM is enough for this toy vocabulary
model.add(LSTM(3))
model.add(Dropout(0.2))
# Output a probability for every item in the vocabulary
model.add(Dense(vocabulary_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
h = model.fit(X, y, validation_split=0.2, verbose=1, epochs=10)
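
One-hot targets are harmless with six classes, but a production catalog can contain millions of items. Here is a minimal sketch of the same setup with integer targets and a sparse loss; the model_sparse variant is my illustration, not part of the original walkthrough:

# Variation (not from the original): integer targets + sparse loss avoid
# materializing a vocabulary_size-wide one-hot vector for every sample
model_sparse = Sequential()
model_sparse.add(Embedding(vocabulary_size, 5, input_length=max_len - 1))
model_sparse.add(LSTM(3))
model_sparse.add(Dense(vocabulary_size, activation='softmax'))
model_sparse.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
model_sparse.fit(X, sequences[:, -1], validation_split=0.2, verbose=0, epochs=10)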
import matplotlib.pyplot as plt

plt.plot(h.history['loss'], label='Train loss')
plt.plot(h.history['val_loss'], label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.plot(h.history['acc'], label='Train accuracy')
plt.plot(h.history['val_acc'], label='Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
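
Ten epochs is an arbitrary choice. On real data you would usually let the validation loss decide when to stop; here is a sketch with Keras's EarlyStopping callback, where patience=3 is an assumed value to tune:

from keras.callbacks import EarlyStopping

# Stop once validation loss stops improving and restore the best weights
# (patience=3 is an illustrative choice, not from the original)
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
h = model.fit(X, y, validation_split=0.2, epochs=50, callbacks=[es], verbose=1)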
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range(n_words):
        # Encode the history so far and pad/truncate it to the model's input length
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict_classes was removed from recent Keras versions; argmax is equivalent
        yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)[0]
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

# The seed must be item names the tokenizer knows; raw indices like '1 3 5'
# would be dropped during encoding and the model would see only padding
print(generate_seq(model, tokenizer, 3, 'evernote instagram google', 1))
#Example output (depends on the training run):
# instagram
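
A recommender normally returns a ranked list rather than a single next item. Here is a minimal sketch of top-k scoring with the trained model; the recommend_top_k helper and its parameters are my illustration, not part of the original code:

def recommend_top_k(model, tokenizer, seq_length, history_text, k=3):
    # Score every item and return the k most probable ones with their probabilities
    encoded = tokenizer.texts_to_sequences([history_text])[0]
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    probs = model.predict(encoded, verbose=0)[0]
    ranked = np.argsort(probs)[::-1]
    # Skip index 0, which is reserved for padding and maps to no item
    return [(tokenizer.index_word[i], float(probs[i])) for i in ranked if i != 0][:k]

print(recommend_top_k(model, tokenizer, 3, 'evernote instagram', k=3))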

Conclusion

Treating a user's history as text lets us reuse the whole NLP toolchain: tokenize the items, pad the sequences, train an LSTM to predict the next item, and read recommendations off the softmax output. The same recipe applies whether the items are apps, products, or songs.
