from string import punctuation
from collections import Counter

from nltk import bigrams, trigrams, NaiveBayesClassifier, classify
from nltk.corpus import PlaintextCorpusReader, stopwords


# PROVIDED CODE
def load_corpus(corpus_root):
    '''Takes the root directory of a corpus, as a string.
    Returns an NLTK corpus.'''
    return PlaintextCorpusReader(corpus_root, r'.*\.txt')


# PROVIDED CODE
def clean_words(words):
    '''Takes a list of tokens.
    Returns the list without punctuation and stopwords.'''
    stopwords_nor = stopwords.words('norwegian')
    return [word.lower() for word in words
            if word not in punctuation and word not in stopwords_nor]


# PROVIDED CODE
def split_data(pos_feats, neg_feats):
    '''Takes lists of positive and negative features, respectively.
    Returns the lists combined and split into train_set, dev_set and test_set.'''
    test_set = pos_feats[:122] + neg_feats[:122]
    dev_set = pos_feats[122:182] + neg_feats[122:182]
    train_set = pos_feats[182:] + neg_feats[182:]
    return train_set, dev_set, test_set


# EXERCISE 4.2
def feature_extractor_top_1000(document):
    '''Features: the 1000 most frequent words in the document.'''
    features = {}
    fd = Counter(document)
    frequent_words = [word for word, count in fd.most_common(1000)]
    for word in frequent_words:
        features[f'contains({word})'] = True
    return features


# EXERCISE 4.3.1
def feature_extractor_bow(document):
    '''Bag of words: every cleaned word type in the document.'''
    features = {}
    for word in clean_words(set(document)):
        features[f'contains({word})'] = True
    return features


# EXERCISE 4.3.2
def feature_extractor_bow_bigrams(document):
    '''Bag of words plus all bigrams in the document.'''
    features = feature_extractor_bow(document)
    for bigram in set(bigrams(document)):
        features[f'contains({bigram})'] = True
    return features


# EXERCISE 4.3.3
def feature_extractor_bow_bigrams_trigrams(document):
    '''Bag of words plus all bigrams and trigrams in the document.'''
    features = feature_extractor_bow_bigrams(document)
    for trigram in set(trigrams(document)):
        features[f'contains({trigram})'] = True
    return features


def main():
    # EXERCISE 4.1
    reviews = load_corpus('NoReC/')
    pos_reviews = []
    neg_reviews = []
    for file in reviews.fileids():
        words = [word.lower() for word in reviews.words(file)]
        if file.startswith('pos'):
            pos_reviews.append(words)
        elif file.startswith('neg'):
            neg_reviews.append(words)

    # EXERCISE 4.2
    print('1000 MOST FREQUENT WORDS =========================')
    neg_features = [(feature_extractor_top_1000(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_top_1000(review), 'pos') for review in pos_reviews]
    train_set_1000, dev_set_1000, test_set_1000 = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_1000)} len(dev_set)={len(dev_set_1000)} len(test_set)={len(test_set_1000)}')
    classifier_1000 = NaiveBayesClassifier.train(train_set_1000)
    accuracy = classify.accuracy(classifier_1000, dev_set_1000)
    print('Accuracy on dev_set:', accuracy)
    classifier_1000.show_most_informative_features(30)
    # Examples of stopwords are "har", "henne" and "som". Such words tend to
    # occur frequently in a text without contributing much meaning to the
    # model. By weeding out punctuation and stopwords we can achieve higher
    # accuracy, because we get fewer features with "neutral" sentiment.
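    # For illustration (assuming 'dette', 'er' and 'en' are also in NLTK's
    # Norwegian stopword list): clean_words(['dette', 'er', 'en', 'god', 'film', '.'])
    # would return ['god', 'film'], keeping only the sentiment-bearing words.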
    # EXERCISE 4.3.1
    print('\nBAG OF WORDS ===================================')
    neg_features = [(feature_extractor_bow(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_bow(review), 'pos') for review in pos_reviews]
    train_set_bow, dev_set_bow, test_set_bow = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_bow)} len(dev_set)={len(dev_set_bow)} len(test_set)={len(test_set_bow)}')
    classifier_bow = NaiveBayesClassifier.train(train_set_bow)
    accuracy = classify.accuracy(classifier_bow, dev_set_bow)
    print('Accuracy on dev_set:', accuracy)
    classifier_bow.show_most_informative_features(10)

    # EXERCISE 4.3.2
    print('\nBAG OF WORDS + BIGRAMS =========================')
    neg_features = [(feature_extractor_bow_bigrams(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_bow_bigrams(review), 'pos') for review in pos_reviews]
    train_set_bi, dev_set_bi, test_set_bi = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_bi)} len(dev_set)={len(dev_set_bi)} len(test_set)={len(test_set_bi)}')
    classifier_bigram = NaiveBayesClassifier.train(train_set_bi)
    accuracy = classify.accuracy(classifier_bigram, dev_set_bi)
    print('Accuracy on dev_set:', accuracy)
    classifier_bigram.show_most_informative_features(10)

    # EXERCISE 4.3.3
    print('\nBAG OF WORDS + BIGRAMS + TRIGRAMS ==============')
    neg_features = [(feature_extractor_bow_bigrams_trigrams(review), 'neg') for review in neg_reviews]
    pos_features = [(feature_extractor_bow_bigrams_trigrams(review), 'pos') for review in pos_reviews]
    train_set_tri, dev_set_tri, test_set_tri = split_data(pos_features, neg_features)
    print(f'len(train_set)={len(train_set_tri)} len(dev_set)={len(dev_set_tri)} len(test_set)={len(test_set_tri)}')
    classifier_trigram = NaiveBayesClassifier.train(train_set_tri)
    accuracy = classify.accuracy(classifier_trigram, dev_set_tri)
    print('Accuracy on dev_set:', accuracy)
    classifier_trigram.show_most_informative_features(10)

    # EXERCISE 4.4
    print('\nThe model with BoW, bigrams and trigrams gives the highest accuracy on dev_set.')
    accuracy = classify.accuracy(classifier_trigram, test_set_tri)
    print('Accuracy on test_set:', accuracy)
    # Suggested improvements:
    # - Handle negation, so that e.g. "ikke bra" becomes "ikke NOT_bra"
    #   (see the sketch at the end of the file).
    # - Lemmatization.


if __name__ == '__main__':
    main()
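

# Sketch for the negation improvement suggested above (an illustration only,
# not part of the exercise solution): a minimal heuristic that prefixes every
# token after a negation cue with 'NOT_' until the next punctuation mark.
# The cue list ('ikke', 'aldri') is an assumption, not taken from the
# exercise text. The marked tokens could then be fed to the feature
# extractors above instead of the raw tokens.
def mark_negation(words, cues=('ikke', 'aldri')):
    '''Takes a list of lowercased tokens. Returns a copy where every token
    following a negation cue is prefixed with 'NOT_' until the next
    punctuation mark, e.g. ['ikke', 'bra'] -> ['ikke', 'NOT_bra'].'''
    marked = []
    negated = False
    for word in words:
        if word in punctuation:
            negated = False   # punctuation closes the negation scope
            marked.append(word)
        elif word in cues:
            negated = True    # start marking the tokens that follow
            marked.append(word)
        else:
            marked.append('NOT_' + word if negated else word)
    return marked
# Example:
# mark_negation(['filmen', 'er', 'ikke', 'bra', '.', 'men', 'grei'])
# -> ['filmen', 'er', 'ikke', 'NOT_bra', '.', 'men', 'grei']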