编写代码来循环遍历语料库并创建一个包含每个文档中的一元和二元标记的倒排索引。
import os
import re
from collections import defaultdict
import time
# Root directory that is walked to build the index. The value "corpus path"
# looks like a placeholder — NOTE(review): point this at a real corpus directory.
corpus_directory = "corpus path"
def create_inverted_index(corpus_directory, encoding='utf-8', *, tokenizer=None):
    """Build an inverted index of unigram and bigram tokens for a corpus.

    Every file found under ``corpus_directory`` (searched recursively) is read
    as text, tokenized, and indexed under each of its unigrams and each pair
    of adjacent tokens joined by a single space.

    Args:
        corpus_directory: Root directory to walk.
        encoding: Text encoding used to read each file. Undecodable bytes are
            dropped (``errors='ignore'``) rather than raising.
        tokenizer: Optional callable mapping a document string to a sequence
            of string tokens. When omitted, the module-level
            ``tokenize_text(document, lowercase=True)`` is used.

    Returns:
        A dict mapping each token (unigram or ``"first second"`` bigram) to
        the list of file paths containing it. Each path appears at most once
        per token, in the order the files were visited.
    """
    if tokenizer is None:
        # Resolved lazily so tokenize_text is only required when the caller
        # does not supply a tokenizer of their own.
        def tokenizer(text):
            return tokenize_text(text, lowercase=True)

    inverted_index = {}
    for root, _dirs, file_names in os.walk(corpus_directory):
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
                document = f.read()

            tokens = list(tokenizer(document))
            bigrams = [
                " ".join(pair) for pair in zip(tokens, tokens[1:])
            ]

            # dict.fromkeys de-duplicates while preserving first-seen order,
            # so a document is recorded once per token instead of once per
            # occurrence of that token.
            for token in dict.fromkeys(tokens + bigrams):
                inverted_index.setdefault(token, []).append(file_path)
    return inverted_index
上面的代码创建了一个包含每个文档中的一元和二元标记的倒排索引。
接下来，我想要 answer_user_input_query 函数的实现。
def answer_user_input_query(inverted_index):
    """Prompt for a phrase query, run it against the index, and print results.

    Reads one line from standard input, passes it with ``inverted_index`` to
    ``answer_phrase_query`` (defined elsewhere), prints each returned item as
    a document path, and finally prints how long the lookup took.

    Args:
        inverted_index: Index object forwarded unchanged to
            ``answer_phrase_query``; presumably the dict produced by
            ``create_inverted_index`` — confirm against that helper.

    Returns:
        None. All output goes to standard output.
    """
    query = input("Enter a phrase query: ")

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    results = answer_phrase_query(query, inverted_index)
    end_time = time.perf_counter()

    if results:
        print("Matching documents:")
        for doc_path in results:
            print(f"Document Path: {doc_path}")
    else:
        print("No matching documents found.")

    # Reported for both outcomes; only the lookup itself is timed, not the
    # time the user spent typing the query.
    print(f"Time taken to answer the query: {end_time - start_time} seconds")
# Script driver: build the index once for the configured corpus directory, then
# answer a single interactive phrase query against it. These are top-level
# statements, so they also run whenever this module is imported.
inverted_index = create_inverted_index(corpus_directory)
answer_user_input_query(inverted_index)