# Movie Term Chatbot
#
# Trains a small Keras classifier on the first 1000 lines of movie_lines.txt
# (Cornell Movie-Dialogs Corpus format, fields separated by "+++$+++"),
# predicting the speaker ID from the line text, then replies to the user
# with a random stored line from the predicted class.

from __future__ import print_function

import random

import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from nltk.stem import PorterStemmer

batch_size = 32
max_words = 1000
epochs = 2

# Load the raw data; the corpus ships as ISO-8859-1. The last field of each
# line is the utterance, the second-to-last is the speaker.
tempdata = open("movie_lines.txt", encoding="iso-8859-1").readlines()
x_train_temp = np.array([i.split("+++$+++")[-1].strip() for i in tempdata[:1000]])
y_train_temp = [i.split("+++$+++")[-2].strip() for i in tempdata[:1000]]

# Categories: one class per speaker.
categories = list(set(y_train_temp))
print("Categories:" + str(categories))
y_train = np.array([categories.index(i) for i in y_train_temp])
print(y_train)
num_classes = len(set(y_train))
print(num_classes, 'classes')

# Storing originals for generating replies later.
x_train_org = x_train_temp[:]
y_train_org = y_train[:]

# Vocabulary
allwords = ' '.join(x_train_temp).lower().split(' ')
uniquewords = list(set(allwords))

# Stemming (under Python 3 the old decode("utf-8") workaround is unnecessary)
stemmer = PorterStemmer()

def fixWord(word):
    return stemmer.stem(word)

uniquewords = list(set(fixWord(i) for i in uniquewords))
print(uniquewords)

# Convert each line of text into a sequence of vocabulary indices.
x_train = []
for line in x_train_temp:
    iwords = line.lower().split(' ')
    numbers = [uniquewords.index(fixWord(w)) for w in iwords]
    x_train.append(numbers)
print(x_train)

# Make binary bag-of-words vectors; note sequences_to_matrix silently drops
# indices >= max_words.
tokenizer = Tokenizer(num_words=max_words)
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
y_train = keras.utils.to_categorical(y_train, num_classes)
print([i for i in x_train[0]])

embedding_vector_length = 32

# LSTM model. (A simpler dense alternative would replace the Embedding/LSTM
# layers with Dense(512, input_shape=(max_words,)), ReLU, and Dropout(0.5),
# keeping the same softmax output layer.)
model = Sequential()
model.add(Embedding(max_words, embedding_vector_length, input_length=max_words))
model.add(LSTM(100))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

def getCategory(inputString):
    # Encode the input the same way the training data was encoded,
    # skipping words that never appeared in the training vocabulary.
    iwords = inputString.lower().split(' ')
    numbers = [uniquewords.index(fixWord(w)) for w in iwords
               if fixWord(w) in uniquewords]
    token = tokenizer.sequences_to_matrix([numbers], mode='binary')
    return np.argmax(model.predict(token))

def getRandomTextFromIndex(aIndex):
    # Sample stored lines until one from the predicted class comes up.
    res = -1
    while res != aIndex:
        aNumber = random.randint(0, len(y_train_org) - 1)
        res = y_train_org[aNumber]
    return x_train_org[aNumber]

# Chat loop: an empty input ends the session.
s = " "
while s:
    category = getCategory(s)
    text = getRandomTextFromIndex(category)
    print("Chatbot:" + text)
    s = input("Human:")
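
# --- Optional: persist the trained model and vocabulary ---
# A minimal sketch, not part of the original script: saving the model plus
# the vocabulary and category lists lets the chatbot be reloaded later
# without retraining. The file names "chatbot_model.h5" and
# "chatbot_vocab.pkl" are illustrative choices, not required by Keras.
import pickle

model.save("chatbot_model.h5")  # standard Keras model serialization
with open("chatbot_vocab.pkl", "wb") as f:
    pickle.dump({"uniquewords": uniquewords, "categories": categories}, f)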