# Movie Term Chatbot
#
# Trains a small Keras classifier on the first 1000 lines of movie_lines.txt
# (Cornell Movie-Dialogs Corpus format, fields separated by "+++$+++"),
# predicting the speaker ID from the line text, then replies to the user
# with a random stored line from the predicted class.

from __future__ import print_function

import random

import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from nltk.stem import PorterStemmer

batch_size = 32
max_words = 1000
epochs = 2

# Load the raw data; the corpus ships as ISO-8859-1. The last field of each
# line is the utterance, the second-to-last is the speaker.
tempdata = open("movie_lines.txt", encoding="iso-8859-1").readlines()
x_train_temp = np.array([i.split("+++$+++")[-1].strip() for i in tempdata[:1000]])
y_train_temp = [i.split("+++$+++")[-2].strip() for i in tempdata[:1000]]

# Categories: one class per speaker.
categories = list(set(y_train_temp))
print("Categories:" + str(categories))
y_train = np.array([categories.index(i) for i in y_train_temp])
print(y_train)
num_classes = len(set(y_train))
print(num_classes, 'classes')

# Storing originals for generating replies later.
x_train_org = x_train_temp[:]
y_train_org = y_train[:]

# Vocabulary
allwords = ' '.join(x_train_temp).lower().split(' ')
uniquewords = list(set(allwords))

# Stemming (under Python 3 the old decode("utf-8") workaround is unnecessary)
stemmer = PorterStemmer()

def fixWord(word):
    return stemmer.stem(word)

uniquewords = list(set(fixWord(i) for i in uniquewords))
print(uniquewords)

# Convert each line of text into a sequence of vocabulary indices.
x_train = []
for line in x_train_temp:
    iwords = line.lower().split(' ')
    numbers = [uniquewords.index(fixWord(w)) for w in iwords]
    x_train.append(numbers)
print(x_train)

# Make binary bag-of-words vectors; note sequences_to_matrix silently drops
# indices >= max_words.
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
y_train = keras.utils.to_categorical(y_train, num_classes)
print([i for i in x_train[0]])

embedding_vector_length = 32

# LSTM model. (A simpler dense alternative would replace the Embedding/LSTM
# layers with Dense(512, input_shape=(max_words,)), ReLU, and Dropout(0.5),
# keeping the same softmax output layer.)
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

def getCategory(inputString):
    # Encode the input the same way the training data was encoded,
    # skipping words that never appeared in the training vocabulary.
    iwords = inputString.lower().split(' ')
    numbers = [uniquewords.index(fixWord(w)) for w in iwords
               if fixWord(w) in uniquewords]
    token = tokenizer.sequences_to_matrix([numbers], mode='binary')
    return np.argmax(model.predict(token))

def getRandomTextFromIndex(aIndex):
    # Sample stored lines until one from the predicted class comes up.
    res = -1
    while res != aIndex:
        aNumber = random.randint(0, len(y_train_org) - 1)
        res = y_train_org[aNumber]
    return x_train_org[aNumber]

# Chat loop: an empty input ends the session.
s = " "
while s:
    category = getCategory(s)
    text = getRandomTextFromIndex(category)
    print("Chatbot:" + text)
    s = input("Human:")
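
# --- Optional: persist the trained model and vocabulary ---
# A minimal sketch, not part of the original script: saving the model plus
# the vocabulary and category lists lets the chatbot be reloaded later
# without retraining. The file names "chatbot_model.h5" and
# "chatbot_vocab.pkl" are illustrative choices, not required by Keras.
import pickle

model.save("chatbot_model.h5")  # standard Keras model serialization
with open("chatbot_vocab.pkl", "wb") as f:
    pickle.dump({"uniquewords": uniquewords, "categories": categories}, f)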