用 Python 简单实现用于抄袭检测的 Winnowing 算法

问题描述 投票:-3回答:1

enter image description here

计算文本中每个 k-gram 的哈希值,并将这些哈希值组成哈希窗口

python hash text-mining plagiarism-detection
1个回答
0
投票
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.util import ngrams

def pre_processing(filename):
    """Clean the text stored in *filename* and return its character tri-grams.

    The text goes through these steps, in order: digits are stripped,
    the text is tokenized, punctuation tokens are dropped, English stop
    words are dropped, every remaining word is stemmed with the Snowball
    stemmer, the stems are concatenated without separators, and a window of
    three characters is slid over the resulting string.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of 3-character strings. When the file does not exist, a
        ``str`` error message is returned instead (kept as-is for backward
        compatibility), so callers should check the type of the result.
    """
    try:
        with open(filename) as f_obj:
            contents = f_obj.read()
    except FileNotFoundError:
        return "Sorry, the file " + filename + " does not exist"

    # Remove numbers.
    without_digits = re.sub(r'\d+', '', contents)

    # Remove punctuation. A token counts as punctuation when *every*
    # character is a punctuation character. The previous substring test
    # against ``string.punctuation`` let multi-character tokens such as
    # "``", "''", "..." or "--" slip through.
    punctuation = set(string.punctuation)
    punct_removed = " ".join(
        w for w in nltk.word_tokenize(without_digits)
        if not all(ch in punctuation for ch in w)
    )

    # Remove stop words. The list is turned into a set once so each lookup
    # is O(1) instead of a scan of the whole stop-word list per token.
    # NOTE(review): the join / re-tokenize round trips between steps are kept
    # from the original so token boundaries are unchanged.
    lang_stopwords = set(stopwords.words('english'))
    stopwords_removed = " ".join(
        w for w in nltk.word_tokenize(punct_removed)
        if w.lower() not in lang_stopwords
    )

    # Stem every word and glue the stems together into one string, so the
    # tri-grams below are built over characters and span word boundaries.
    stemmer = SnowballStemmer('english')
    stemmed_words = "".join(
        stemmer.stem(word) for word in nltk.word_tokenize(stopwords_removed)
    )

    # ngrams() over a string yields tuples of characters; join each tuple
    # back into a 3-character string.
    return [''.join(grams) for grams in ngrams(stemmed_words, 3)]


# Build the tri-grams of the input text.
# pre_processing() reads the file, cleans its contents and returns the tri-grams.
tri_gram = pre_processing(filename='text_filename.txt')

def calc_hash(txt, *, k=3, base=26, modulus=997):
    """Return a polynomial hash of the first *k* characters of *txt*.

    Starting from 0, the hash is updated for each character ``ch`` as
    ``(base * hash + ord(ch)) % modulus``. With the default arguments this
    hashes a tri-gram with base 26 modulo 997.

    Args:
        txt: The k-gram to hash. Only its first *k* characters are used; if
            it is shorter than *k*, the characters that exist are hashed
            (an empty string hashes to 0).
        k: Number of leading characters to hash.
        base: Multiplier of the polynomial hash.
        modulus: Modulus that bounds the hash to ``range(modulus)``.

    Returns:
        The hash value as an int in ``range(modulus)``.
    """
    t = 0
    # Slicing instead of indexing keeps short input from raising IndexError.
    for ch in txt[:k]:
        t = (base * t + ord(ch)) % modulus
    return t

# Save the hash value of every tri-gram in a dictionary keyed by the
# tri-gram's position in the text.
# pre_processing() returns an error-message string instead of a list when the
# input file is missing; treat that case as "no tri-grams".
grams = tri_gram if isinstance(tri_gram, list) else []
hash_value = {pos: calc_hash(gram) for pos, gram in enumerate(grams)}

# Form the hash windows: each window holds WINDOW_SIZE consecutive hashes.
# The last start index is len - WINDOW_SIZE so no window runs past the end
# of the sequence (fewer hashes than WINDOW_SIZE yields no windows).
WINDOW_SIZE = 3
outside_list = [
    [hash_value[j] for j in range(start, start + WINDOW_SIZE)]
    for start in range(len(hash_value) - WINDOW_SIZE + 1)
]

print(outside_list)
© www.soinside.com 2019 - 2024. All rights reserved.