我用以下代码创建了一个python文件。我希望代码执行以下操作:
我想为大型文本文件运行此代码。
有人可以帮助我提高此代码的效率吗?我是新手,并在互联网的帮助下编写了此代码。
代码:
#<<<---------- INPUT TEXT FILE ------------>>>
# load data
filename = 'input.txt'
file = open(filename, 'rt')
text = file.read()
file.close()
#<<<---------- CLEAN TEXT ------------>>>
# split into words
import nltk
from nltk.tokenize import word_tokenize
tokens = nltk.word_tokenize(text)
# convert to lower case
tokens = [w.lower() for w in tokens]
# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]
#join words as a sentence
cleantext = " ".join(words)
#<<<---------- CREATE UNIGRAMS ------------>>>
unigrm1 = nltk.word_tokenize(cleantext)
#<<<---------- CREATE BIGRAMS ------------>>>
tokens1 = nltk.word_tokenize(cleantext)
bigrm = nltk.bigrams(tokens1)
bigrm = list(nltk.bigrams(cleantext.split()))
bigrm1 = [' '.join(t) for t in bigrm]
#<<<---------- COMBINE UNIGRAMS & BIGRAMS ------------>>>
ngram1 = unigrm1 + bigrm1
ngram2 = ", ".join(ngram1)
#<<<---------- REMVOE DUPLCIATES IN BIGRAMS ------------>>>
# stop words removal
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
text_tokens = word_tokenize(ngram2)
tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
words = (" ").join(tokens_without_sw)
words = words.replace(" ,", ",")
words = words.replace(",,,,,", ",")
words = words.replace(",,,", ",")
words = words.replace(",,", ",")
words = words.split(", ")
words.sort()
# remove duplicates
k = []
for i in words:
# If condition is used to store unique string
# in another list 'k'
if (words.count(i)>1 and (i not in k)or words.count(i)==1):
k.append(i)
#<<<---------- SHOW NUMBER OF WORDS ------------>>>
countwords = text.split()
print('Number of words in raw file :', len(countwords))
file.close()
print('Number of words in extracted file :', len(k))
file.close()
#<<<---------- SAVE AS OUTPUT TEXT FILE ------------>>>
# save as text output
import sys
file = open('output.txt', 'w+')
sys.stdout = file
print(*map(''.join, k), sep=', ')
file.close()
#<<<---------- END OF CODES ------------>>>
可以在重置下一行的bigrm后删除此行。
bigrm = nltk.bigrams(tokens1)
在本节中,file.close()被调用了两次,但是文件没有打开,因此在两种情况下都可以丢弃file.close()。
#<<<---------- SHOW NUMBER OF WORDS ------------>>>
countwords = text.split()
print('Number of words in raw file :', len(countwords))
print('Number of words in extracted file :', len(k))
此外,sys.stdout使用后应重新设置。
orig_stdout = sys.stdout
sys.stdout = file
print(*map(''.join, k), sep=', ')
file.close()
sys.stdout = orig_stdout
至少您可以在运行代码后继续与终端进行交互,应该略加:)