There is a problem with my information retrieval code.
All I want is to retrieve information from the txt files I previously added to the corpus directory. At first everything was fine: the retrieved results weren't amazing, but they were a good start.
But now, every time I run the code, I get this error:

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte
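
One quick way to narrow this down is to inspect the raw bytes around the offset reported in the traceback. A check like the following (the file name here is a placeholder for whichever corpus file triggers the error) shows exactly which byte the decoder is failing on:

# "corpus/some_file.txt" is a placeholder; substitute the file that fails
with open("corpus/some_file.txt", "rb") as f:
    raw = f.read()
print(raw[3120:3140])   # the bytes surrounding the problem
print(hex(raw[3131]))   # the traceback points here; expect 0x80

The full script is below: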
import math
import os
import string
import sys

import nltk

FILE_MATCHES = 1
SENTENCE_MATCHES = 1


def main():

    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python questions.py corpus")
    else:
        print("Hi! I hope you are having a good time. Thank you for contacting Appen today.")

    # Calculate IDF values across files
    files = load_files(sys.argv[1])
    file_words = {
        filename: tokenize(files[filename])
        for filename in files
    }
    file_idfs = compute_idfs(file_words)

    # Prompt user for query
    query = set(tokenize(input("What can I help you with today?: ")))

    # Determine top file matches according to TF-IDF
    filenames = top_files(query, file_words, file_idfs, n=FILE_MATCHES)

    # Extract sentences from top files
    sentences = dict()
    for filename in filenames:
        for passage in files[filename].split("\n"):
            for sentence in nltk.sent_tokenize(passage):
                tokens = tokenize(sentence)
                if tokens:
                    sentences[sentence] = tokens

    # Compute IDF values across sentences
    idfs = compute_idfs(sentences)

    # Determine top sentence matches
    matches = top_sentences(query, sentences, idfs, n=SENTENCE_MATCHES)
    for match in matches:
        print(match)


def load_files(directory):
    """
    Given a directory name, return a dictionary mapping the filename of each
    `.txt` file inside that directory to the file's contents as a string.
    """
    corpus = {}
    abspath = os.path.join(os.getcwd(), directory)
    for filename in os.listdir(directory):
        with open(os.path.join(abspath, filename), mode="r", encoding="utf-8") as f:
            corpus[filename] = f.read().rstrip("\n")
    return corpus


def tokenize(document):
    """
    Given a document (represented as a string), return a list of all of the
    words in that document, in order.

    Process the document by converting all words to lowercase and removing
    any punctuation or English stopwords.
    """
    tokens = nltk.tokenize.word_tokenize(document.lower())
    return [
        x for x in tokens
        if x not in string.punctuation
        and x not in nltk.corpus.stopwords.words("english")
    ]


def compute_idfs(documents):
    """
    Given a dictionary of `documents` that maps names of documents to a list
    of words, return a dictionary that maps words to their IDF values.

    Any word that appears in at least one of the documents should be in the
    resulting dictionary.
    """
    # Collect every word that appears in any document
    words = set()
    for filename in documents:
        words.update(documents[filename])

    # Calculate IDFs: log(total documents / documents containing the word)
    idfs = dict()
    for word in words:
        f = sum(word in documents[filename] for filename in documents)
        idfs[word] = math.log(len(documents) / f)
    return idfs


def top_files(query, files, idfs, n):
    """
    Given a `query` (a set of words), `files` (a dictionary mapping names of
    files to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the filenames of the `n` top
    files that match the query, ranked according to tf-idf.
    """
    # For each file, sum term frequency * IDF over the query words it contains
    tfidfs = {}
    for filename, filecon in files.items():
        score = 0
        for word in query:
            if word in filecon:
                score += filecon.count(word) * idfs[word]
        if score != 0:
            tfidfs[filename] = score

    # Sort by score and return the top n filenames
    print("Give me a moment to check that for you.")
    ranked = [k for k, v in sorted(tfidfs.items(), key=lambda y: y[1], reverse=True)]
    return ranked[:n]


def top_sentences(query, sentences, idfs, n):
    """
    Given a `query` (a set of words), `sentences` (a dictionary mapping
    sentences to a list of their words), and `idfs` (a dictionary mapping words
    to their IDF values), return a list of the `n` top sentences that match
    the query, ranked according to IDF. If there are ties, preference should
    be given to sentences that have a higher query term density.
    """
    # For each sentence, accumulate matching-word IDF and query term density
    scores = []
    for sentence in sentences:
        idf = 0
        match = 0
        for word in query:
            if word in sentences[sentence]:
                # The query word is in the sentence: add its IDF and count a match
                idf += idfs[word]
                match += 1
        density = float(match) / len(sentences[sentence])
        scores.append((sentence, idf, density))

    # Sort by summed IDF, breaking ties by density, and return the top n
    scores.sort(key=lambda x: (x[1], x[2]), reverse=True)
    return [x[0] for x in scores][:n]


if __name__ == "__main__":
    main()
I looked this problem up online and found several answers; I tried them, but none of them worked.
This error indicates that the file is not UTF-8 encoded. The most common case in which a text file contains the byte 0x80 is Windows-1252 (often called "ANSI"), where 0x80 encodes the character €.
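
If the offending file was saved from Windows Notepad or pasted out of Word, re-saving it as UTF-8 fixes the problem at the source. Alternatively, load_files can try UTF-8 first and fall back to Windows-1252 when that fails. This is only a minimal sketch, assuming the rest of the script stays as in the question:

import os

def load_files(directory):
    """Map each filename in `directory` to the file's contents as a string."""
    corpus = {}
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        try:
            # Most corpus files should be UTF-8
            with open(filepath, mode="r", encoding="utf-8") as f:
                corpus[filename] = f.read().rstrip("\n")
        except UnicodeDecodeError:
            # Fall back for files saved as Windows-1252 ("ANSI"),
            # where byte 0x80 is the euro sign
            with open(filepath, mode="r", encoding="cp1252") as f:
                corpus[filename] = f.read().rstrip("\n")
    return corpus

Note that cp1252 itself rejects a few unassigned bytes (0x81, 0x8D, 0x8F, 0x90, 0x9D), so the fallback is a guess rather than a guarantee; if the corpus mixes several encodings, an encoding detector such as charset-normalizer or chardet is the more robust option.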