How to resolve the algorithm Inverted index step by step in the Nim programming language
Published on 12 May 2024 09:40 PM
How to resolve the algorithm Inverted index step by step in the Nim programming language
Table of Contents
Problem Statement
An Inverted Index is a data structure used to create full text search.
Given a set of text files, implement a program to create an inverted index. Also create a user interface to do a search using that inverted index which returns a list of files that contain the query term / terms. The search index can be in memory.
Let's start with the solution:
Step by Step solution about How to resolve the algorithm Inverted index step by step in the Nim programming language
Source code in the nim programming language
import os, strformat, strutils, tables
type
WordRef = tuple[docnum, linenum: int]
InvertedIndex = object
docs: seq[string]
index: Table[string, seq[WordRef]]
SearchResult = object
total: int
refs: seq[tuple[docname: string; linenums: seq[int]]]
IndexingError = object of CatchableError
const
# Characters composing a word (letters + "'").
WordChars = Letters + {'\''}
# Words to ignore.
StopWords = ["about", "above", "after", "again", "against", "all", "am", "an", "and", "any",
"are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below",
"between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did",
"didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each",
"few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have",
"haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's",
"hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll",
"i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself",
"let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not",
"of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours",
"ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll",
"she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's",
"the", "their", "theirs", "them", "themselves", "then", "there", "there's",
"these", "they", "they'd", "they'll", "they're", "they've", "this", "those",
"through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we",
"we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when",
"when's", "where", "where's", "which", "while", "who", "who's", "whom", "why",
"why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're",
"you've", "your", "yours", "yourself", "yourselves"]
template plural(n: int): string = (if n > 1: "s" else: "")
proc add(invIndex: var InvertedIndex; docPath: string) =
## Add a document to the index.
let docName = docPath.extractFilename()
if docName in invIndex.docs:
raise newException(IndexingError, &"{docName}: a document with this name is alreadry indexed.")
invIndex.docs.add docName
let docIndex = invIndex.docs.high
var linenum = 0
var count = 0
try:
for line in docPath.lines:
inc linenum
var word = ""
for ch in line:
if ch in WordChars:
word.add ch.toLowerAscii()
else:
if word.len > 1 and word notin StopWords:
invIndex.index.mgetOrPut(word, newSeq[WordRef]()).add (docIndex, linenum)
inc count
word = ""
except IOError:
raise newException(IndexingError, &"{docName}: error while reading file.")
if count > 0:
echo &"{docName}: {count} word{plural(count)} indexed."
else:
raise newException(IndexingError, &"{docName}: nothing to index.")
func status(invIndex: InvertedIndex): string =
## Display information about the inverted index status.
let words = invIndex.index.len
let docs = invIndex.docs.len
&"Index contains {words} word{plural(words)} from {docs} document{plural(docs)}."
proc search(invIndex: InvertedIndex; word: string): SearchResult =
## Search a word in the inverted index.
## The references are grouped by document.
let refs = invIndex.index.getOrDefault(word)
if refs.len == 0: return
result.total = refs.len
var prevdoc = ""
var prevline = 0
for (docnum, linenum) in refs:
let docname = invIndex.docs[docnum]
if docname == prevDoc:
if linenum != prevline:
result.refs[^1].linenums.add(linenum)
prevline = linenum
else:
result.refs.add((docname, @[linenum]))
prevdoc = docname
prevline = linenum
#———————————————————————————————————————————————————————————————————————————————————————————————————
var invertedIndex: InvertedIndex
if paramCount() != 1 or not paramStr(1).dirExists():
quit &"Usage: {getAppFileName().lastPathPart} directory"
# Index the documents.
for doc in walkFiles(paramStr(1) / "*.txt"):
try:
invertedIndex.add(doc)
except IndexingError:
echo getCurrentExceptionMsg()
echo invertedIndex.status()
echo ""
# Enter search interface.
var word: string
while true:
try:
stdout.write "Word to search? "
word = stdin.readLine().toLowerAscii().strip()
if word == "q":
echo "Quitting"
break
if not word.allCharsInSet(WordChars):
echo "This word contains invalid characters."
continue
except EOFError:
echo ""
echo "Quitting"
break
# Search word.
let result = invertedIndex.search(word)
if result.refs.len == 0:
echo "No reference found for this word."
continue
# Display the references.
echo &"Found {result.total} reference{plural(result.total)} to “{word}”:"
for (docname, linenums) in result.refs:
stdout.write &"... in “{docName}”, line{plural(linenums.len)}"
for num in linenums: stdout.write ' ', num
echo ""
You may also check:How to resolve the algorithm Faces from a mesh step by step in the Phix programming language
You may also check:How to resolve the algorithm HTTP step by step in the J programming language
You may also check:How to resolve the algorithm Tau function step by step in the C programming language
You may also check:How to resolve the algorithm Remove duplicate elements step by step in the Modula-2 programming language
You may also check:How to resolve the algorithm Doubly-linked list/Traversal step by step in the Scala programming language