from collections import defaultdict

import numpy as np
from nltk import bigrams, trigrams
from nltk import word_tokenize


# The procedure takes sentences as lists of words.
def produce_text(input_sents):
    """Train a trigram model on *input_sents* and print one sampled sentence.

    Each sentence is padded on both sides, so ``None`` marks both the start
    and the end of a sentence. Generation starts from the context
    ``(None, None)`` and repeatedly samples the next token from the
    conditional distribution P(w3 | w1, w2) until two consecutive ``None``
    markers have been produced.

    Args:
        input_sents: Iterable of sentences, each a sequence of hashable
            tokens (strings, for the final join to work).

    Returns:
        The generated sentence as a space-joined string. The same string is
        also printed to standard output.

    Raises:
        ValueError: If *input_sents* yields no sentences, so there is nothing
            to sample from.
    """
    start_context = (None, None)

    # Count how often each token follows each two-token context. Padding adds
    # None markers for sentence start and sentence end.
    trigram_counts = defaultdict(lambda: defaultdict(int))
    for sentence in input_sents:
        for w1, w2, w3 in trigrams(sentence, pad_left=True, pad_right=True):
            trigram_counts[(w1, w2)][w3] += 1

    # Normalise counts to probabilities. A plain dict is used so that lookups
    # during generation never silently insert empty entries.
    trigram_model = {}
    for context, followers in trigram_counts.items():
        total = sum(followers.values())
        trigram_model[context] = (
            list(followers),
            [count / total for count in followers.values()],
        )

    if start_context not in trigram_model:
        raise ValueError(
            "input_sents must contain at least one sentence to build a model"
        )

    text = [None, None]
    while True:
        words, probs = trigram_model[tuple(text[-2:])]
        # Sample an index rather than the tokens themselves: numpy would
        # otherwise coerce the token list into an array (returning numpy
        # string types and failing for non-scalar tokens).
        text.append(words[np.random.choice(len(words), p=probs)])
        # Two trailing end markers mean the sentence is complete.
        if text[-2:] == [None, None]:
            break

    # Drop only the padding markers, not other falsy tokens.
    generated = ' '.join(t for t in text if t is not None)
    print(generated)
    return generated