print('OPPGAVE 3.1 -----------------------------')

from nltk.corpus import gutenberg

# Token count for the King James Bible.
bible_words = gutenberg.words('bible-kjv.txt')
total_words = len(bible_words)
print('Antall tokens:', total_words)

print('\nOPPGAVE 3.2 -----------------------------')

# Type count: case-fold first so "The" and "the" count as one word type.
distinct_words = {word.lower() for word in bible_words}
total_distinct_words = len(distinct_words)
print('Antall ordtyper:', total_distinct_words)

print('\nOPPGAVE 3.3 -----------------------------')

from collections import Counter

# Frequency distribution over the raw (case-sensitive) tokens.
fd_bible_words = Counter(bible_words)
print('Mest frekvente ord:\n', fd_bible_words.most_common(20))

print('\nOPPGAVE 3.4 -----------------------------')
print('Frekvens for "earth":', fd_bible_words['earth'])
print('Frekvens for "life":', fd_bible_words['life'])
print('Frekvens for "death":', fd_bible_words['death'])

print('\nOPPGAVE 3.5 -----------------------------')

from nltk import bigrams

bible_sents = gutenberg.sents('bible-kjv.txt')
seventh_sentence = bible_sents[6]
print('Bigrammer syvende setning:\n',
      list(bigrams(seventh_sentence, pad_left=True, pad_right=True)))

print('\nOPPGAVE 3.6 -----------------------------')

from nltk import trigrams

eighth_sentence = bible_sents[7]
print('Trigrammer åttende setning:\n',
      list(trigrams(eighth_sentence, pad_left=True, pad_right=True)))

print('\nOPPGAVE 3.7 -----------------------------')

from collections import defaultdict
import numpy as np

# Bigram counts: bigram_counts[w1][w2] = number of times w2 follows w1.
# Sentences are padded with None on both sides so None acts as the
# sentence-start/sentence-end marker.
bigram_counts = defaultdict(lambda: defaultdict(int))
for sentence in bible_sents:
    for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
        bigram_counts[w1][w2] += 1

# Conditional probabilities: P(w2 | w1) = count(w1, w2) / count(w1, *).
bigram_model = defaultdict(lambda: defaultdict(float))
for w1 in bigram_counts:
    total_bigramcount = sum(bigram_counts[w1].values())
    for w2 in bigram_counts[w1]:
        bigram_model[w1][w2] = bigram_counts[w1][w2] / total_bigramcount

# Text generation: start from the sentence-start marker (None) and sample the
# next token from P(. | previous token) until we have at least 50 tokens and
# the last sampled token is a sentence-end marker (None).
text = [None]
sentence_is_finished = False
while not sentence_is_finished:
    key = text[-1]
    words = list(bigram_model[key].keys())
    probs = list(bigram_model[key].values())
    text.append(np.random.choice(words, p=probs))
    # Fixed from "== None": identity comparison is the correct idiom.
    if len(text) >= 50 and text[-1] is None:
        sentence_is_finished = True

# Drop the None padding markers before joining.
text_string = ' '.join(token for token in text if token)
print('Generert tekst:\n', text_string)

print('\nOPPGAVE 3.8 -----------------------------')

# Probability of the generated text under the bigram model.
# NOTE(review): a product of many small probabilities underflows towards 0.0
# for long texts; summing log-probabilities would be numerically safer.
bigram_probabilities = [bigram_model[w1][w2] for w1, w2 in bigrams(text)]
print('Tekstsannsynlighet:', np.prod(bigram_probabilities))

# OPPGAVE 4.1 ------------------------------------
import nltk

# RegexpTagger tries the patterns in order and assigns the tag of the first
# regex that matches the word; the catch-all NN rule must therefore come last.
# Bug fixes vs. the original: the original used a literal '\$' (and once the
# Unicode character 'ˆ' instead of '^'), so the end/start anchors never worked;
# the tag 'PRP\$' contained a stray backslash; '[sS]ould' was a typo for
# '[sS]hould'; '[wW]at' is assumed to be a typo for '[wW]hat' (wh-pronoun).
patterns = [
    (r'\b[wW]hat\b', 'WP'),                          # wh-pronoun "what"
    (r'\bhere\b', 'RN'),                             # here
    (r',', ','),                                     # comma
    (r'\.', '.'),                                    # full stop
    (r'\b[mM]y\b|\b[yY]our\b|\b[hH]is\b', 'PRP$'),   # possessive pronouns my, your, his, ...
    (r'\b([sS]|[hH])e\b|\b[iI]\b|\b[yY]ou\b', 'PRP'),  # personal pronouns I, you, he, she, ...
    (r'\b[oO]f\b|\b[iI]n\b|\b[bB]y\b', 'IN'),        # preposition / subordinating conjunction
    (r'\b[cC]an\b|\b[cC]ould\b|\b[sS]hould\b|\b[wW]ill\b|\b[wW]ould\b', 'MD'),  # modals
    (r'.*est$', 'JJS'),                              # superlative adjectives (e.g. wildest)
    (r'.*ly$', 'RB'),                                # adverbs (e.g. strongly, massively)
    (r'and$|or$', 'CC'),                             # conjunctions and/or
    (r'\b[tT]he\b|\b[aA]n?\b|\bthat\b', 'DT'),       # determiners the, a, an, that
    (r'.*able$', 'JJ'),                              # adjectives (e.g. printable, manageable)
    # From the NLTK book / exercise set:
    (r'.*ing$', 'VBG'),                              # gerunds
    (r'.*ed$', 'VBD'),                               # simple past
    (r'.*es$', 'VBZ'),                               # 3rd person singular present
    (r'.*ould$', 'MD'),                              # modals
    (r'.*\'s$', 'NN$'),                              # possessive nouns
    (r'.*s$', 'NNS'),                                # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),                # cardinal numbers
    # Default tag as a last resort:
    (r'.*', 'NN'),                                   # nouns
]
regexp_tagger = nltk.RegexpTagger(patterns)

print('\nOPPGAVE 4.2 -----------------------------')

from nltk.corpus import brown

# Fetch sentences from the Brown corpus.
brown_tagged_sents = brown.tagged_sents(categories='adventure')
brown_sents = brown.sents(categories='adventure')
brown_tagged_sents_fiction = brown.tagged_sents(categories='fiction')

# Inspect one sentence.
setning_indeks = 14
print('Ikke-tagget setning:\n', brown_sents[setning_indeks])
print('Modellens taggede forsøk:\n', regexp_tagger.tag(brown_sents[setning_indeks]))

# Evaluate on the adventure category.
# NOTE(review): Tagger.evaluate() is deprecated in newer NLTK releases in
# favour of Tagger.accuracy() — switch if the installed NLTK requires it.
print('Nøyaktighet, adventure:', regexp_tagger.evaluate(brown_tagged_sents))
# Evaluate on the fiction category.
print('Nøyaktighet, fiction:', regexp_tagger.evaluate(brown_tagged_sents_fiction))

# OPPGAVE 4.3 ------------------------------------
# Tag each line of the input file and write the tagged sentences out.
# Context managers guarantee the files are closed even if tagging raises.
with open("setninger.txt", encoding='utf-8') as innfil, \
        open("taggede_setninger.txt", "w", encoding='utf-8') as utfil:
    for linje in innfil:
        tokenized_sentence = nltk.word_tokenize(linje)
        utfil.write(str(regexp_tagger.tag(tokenized_sentence)) + '\n')