print('OPPGAVE 3.1 -----------------------------')

from nltk.corpus import gutenberg

# Token count for the King James Bible.
bible_words = gutenberg.words('bible-kjv.txt')
total_words = len(bible_words)
print('Antall tokens:', total_words)

print('\nOPPGAVE 3.2 -----------------------------')

# Type count: case-fold first so "The" and "the" count as one word type.
distinct_words = {word.lower() for word in bible_words}
total_distinct_words = len(distinct_words)
print('Antall ordtyper:', total_distinct_words)

print('\nOPPGAVE 3.3 -----------------------------')

from collections import Counter

# Frequency distribution over the raw (case-sensitive) tokens.
fd_bible_words = Counter(bible_words)
print('Mest frekvente ord:\n', fd_bible_words.most_common(20))

print('\nOPPGAVE 3.4 -----------------------------')
print('Frekvens for "earth":', fd_bible_words['earth'])
print('Frekvens for "life":', fd_bible_words['life'])
print('Frekvens for "death":', fd_bible_words['death'])

print('\nOPPGAVE 3.5 -----------------------------')

from nltk import bigrams

bible_sents = gutenberg.sents('bible-kjv.txt')
seventh_sentence = bible_sents[6]
print('Bigrammer syvende setning:\n',
      list(bigrams(seventh_sentence, pad_left=True, pad_right=True)))

print('\nOPPGAVE 3.6 -----------------------------')

from nltk import trigrams

eighth_sentence = bible_sents[7]
print('Trigrammer åttende setning:\n',
      list(trigrams(eighth_sentence, pad_left=True, pad_right=True)))

print('\nOPPGAVE 3.7 -----------------------------')

from collections import defaultdict
import numpy as np

# Bigram counts: bigram_counts[w1][w2] = number of times w2 follows w1.
# Sentences are padded with None on both sides so None acts as the
# sentence-start/sentence-end marker.
bigram_counts = defaultdict(lambda: defaultdict(int))
for sentence in bible_sents:
    for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
        bigram_counts[w1][w2] += 1

# Conditional probabilities: P(w2 | w1) = count(w1, w2) / count(w1, *).
bigram_model = defaultdict(lambda: defaultdict(float))
for w1 in bigram_counts:
    total_bigramcount = sum(bigram_counts[w1].values())
    for w2 in bigram_counts[w1]:
        bigram_model[w1][w2] = bigram_counts[w1][w2] / total_bigramcount

# Text generation: start from the sentence-start marker (None) and sample the
# next token from P(. | previous token) until we have at least 50 tokens and
# the last sampled token is a sentence-end marker (None).
text = [None]
sentence_is_finished = False
while not sentence_is_finished:
    key = text[-1]
    words = list(bigram_model[key].keys())
    probs = list(bigram_model[key].values())
    text.append(np.random.choice(words, p=probs))
    # Fixed from "== None": identity comparison is the correct idiom.
    if len(text) >= 50 and text[-1] is None:
        sentence_is_finished = True

# Drop the None padding markers before joining.
text_string = ' '.join(token for token in text if token)
print('Generert tekst:\n', text_string)

print('\nOPPGAVE 3.8 -----------------------------')

# Probability of the generated text under the bigram model.
# NOTE(review): a product of many small probabilities underflows towards 0.0
# for long texts; summing log-probabilities would be numerically safer.
bigram_probabilities = [bigram_model[w1][w2] for w1, w2 in bigrams(text)]
print('Tekstsannsynlighet:', np.prod(bigram_probabilities))

# OPPGAVE 4.1 ------------------------------------
import nltk

# RegexpTagger tries the patterns in order and assigns the tag of the first
# regex that matches the word; the catch-all NN rule must therefore come last.
# Bug fixes vs. the original: the original used a literal '\$' (and once the
# Unicode character 'ˆ' instead of '^'), so the end/start anchors never worked;
# the tag 'PRP\$' contained a stray backslash; '[sS]ould' was a typo for
# '[sS]hould'; '[wW]at' is assumed to be a typo for '[wW]hat' (wh-pronoun).
patterns = [
    (r'\b[wW]hat\b', 'WP'),                          # wh-pronoun "what"
    (r'\bhere\b', 'RN'),                             # here
    (r',', ','),                                     # comma
    (r'\.', '.'),                                    # full stop
    (r'\b[mM]y\b|\b[yY]our\b|\b[hH]is\b', 'PRP$'),   # possessive pronouns my, your, his, ...
    (r'\b([sS]|[hH])e\b|\b[iI]\b|\b[yY]ou\b', 'PRP'),  # personal pronouns I, you, he, she, ...
    (r'\b[oO]f\b|\b[iI]n\b|\b[bB]y\b', 'IN'),        # preposition / subordinating conjunction
    (r'\b[cC]an\b|\b[cC]ould\b|\b[sS]hould\b|\b[wW]ill\b|\b[wW]ould\b', 'MD'),  # modals
    (r'.*est$', 'JJS'),                              # superlative adjectives (e.g. wildest)
    (r'.*ly$', 'RB'),                                # adverbs (e.g. strongly, massively)
    (r'and$|or$', 'CC'),                             # conjunctions and/or
    (r'\b[tT]he\b|\b[aA]n?\b|\bthat\b', 'DT'),       # determiners the, a, an, that
    (r'.*able$', 'JJ'),                              # adjectives (e.g. printable, manageable)
    # From the NLTK book / exercise set:
    (r'.*ing$', 'VBG'),                              # gerunds
    (r'.*ed$', 'VBD'),                               # simple past
    (r'.*es$', 'VBZ'),                               # 3rd person singular present
    (r'.*ould$', 'MD'),                              # modals
    (r'.*\'s$', 'NN$'),                              # possessive nouns
    (r'.*s$', 'NNS'),                                # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),                # cardinal numbers
    # Default tag as a last resort:
    (r'.*', 'NN'),                                   # nouns
]
regexp_tagger = nltk.RegexpTagger(patterns)

print('\nOPPGAVE 4.2 -----------------------------')

from nltk.corpus import brown

# Fetch sentences from the Brown corpus.
brown_tagged_sents = brown.tagged_sents(categories='adventure')
brown_sents = brown.sents(categories='adventure')
brown_tagged_sents_fiction = brown.tagged_sents(categories='fiction')

# Inspect one sentence.
setning_indeks = 14
print('Ikke-tagget setning:\n', brown_sents[setning_indeks])
print('Modellens taggede forsøk:\n', regexp_tagger.tag(brown_sents[setning_indeks]))

# Evaluate on the adventure category.
# NOTE(review): Tagger.evaluate() is deprecated in newer NLTK releases in
# favour of Tagger.accuracy() — switch if the installed NLTK requires it.
print('Nøyaktighet, adventure:', regexp_tagger.evaluate(brown_tagged_sents))
# Evaluate on the fiction category.
print('Nøyaktighet, fiction:', regexp_tagger.evaluate(brown_tagged_sents_fiction))

# OPPGAVE 4.3 ------------------------------------
# Tag each line of the input file and write the tagged sentences out.
# Context managers guarantee the files are closed even if tagging raises.
with open("setninger.txt", encoding='utf-8') as innfil, \
        open("taggede_setninger.txt", "w", encoding='utf-8') as utfil:
    for linje in innfil:
        tokenized_sentence = nltk.word_tokenize(linje)
        utfil.write(str(regexp_tagger.tag(tokenized_sentence)) + '\n')