#!/usr/bin/env python
# coding: utf-8

# In this example, instead of actual terms, we use letters to represent any
# arbitrary unique term.

# In[1]:

# Task: construct an inverted index from this corpus.
# For simplicity, let's pretend that these letters are terms.
corpus = [
    "a a b c",        # id 0
    "a b c d c",      # id 1
    "b a a b d",      # id 2
    "c c c b e f g",  # id 3
]

# In[2]:

# We can use enumerate to see the ids with the corresponding documents.
for doc_id, document in enumerate(corpus):
    print(f"document {doc_id}: {document}")

# What we want to create from this corpus:
# assign each term a posting list of document ids.

# In[3]:

# An inverted index could look like this:
sample_index = {
    "a": [0, 1, 2],
    "b": [0, 1, 2, 3],
    "c": [0, 1, 3],
    "d": [1, 2],
    "e": [3],
    "f": [3],
    "g": [3],
}

# In[4]:

# Print every document containing c:
for i in sample_index["c"]:
    print("Document", i, ": ", corpus[i])

# How to build the index programmatically

# In[5]:

index = {}

# Walk the documents in id order so every posting list ends up sorted by
# document id. Each document is reduced to its set of distinct tokens so a
# document id is appended at most once per term.
for doc_id, document in enumerate(corpus):
    for token in set(document.split()):
        if token not in index:
            index[token] = [doc_id]
        else:
            index[token].append(doc_id)

index

# Here is another way to do it

# In[6]:

# The vocabulary: every distinct term that occurs anywhere in the corpus.
terms = set(" ".join(corpus).split())
terms

# In[7]:

# Tokenize each document once. Membership must be tested against the token
# set, not the raw string: `t in "some string"` is a substring check, so a
# term like "a" would wrongly match a document that only contains "cat".
tokenized_docs = [set(document.split()) for document in corpus]

index = {
    t: [doc_id for doc_id, tokens in enumerate(tokenized_docs) if t in tokens]
    for t in terms
}
index

# ## Note:
# In the mandatory assignment:
# * The inverted index is not implemented as a dictionary, like in this
#   notebook.
# * The postings are objects containing both document ids and term
#   frequencies, instead of single integers denoting only document ids.
# * The documents are divided into fields and have their own class.