import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.util import ngrams
def pre_processing(filename):
    """Read *filename* and reduce its text to character tri-grams.

    Pipeline: strip digits, tokenize, drop single-character punctuation
    tokens, drop English stopwords, stem with the Snowball stemmer, then
    slide a 3-character window over the concatenated stems.

    Returns:
        list[str]: the tri-gram strings for a readable file, or
        str: an apology message when the file does not exist (legacy
        contract kept so existing callers keep working).
    """
    try:
        # Explicit utf-8 keeps reads deterministic across platforms
        # (the default encoding is locale-dependent).
        with open(filename, encoding='utf-8') as f_obj:
            contents = f_obj.read()
    except FileNotFoundError:
        # NOTE: returning a message instead of raising is the existing
        # contract; callers inspect/print the return value.
        return "Sorry, the file " + filename + " does not exist"
    # removing numbers
    remove_numbers = re.sub(r'\d+', '', contents)
    # word tokenizing
    tokenized_words = nltk.word_tokenize(remove_numbers)
    # remove punctuation (string.punctuation holds single characters, so
    # multi-char tokens such as "..." pass through, as before)
    punct_removed = " ".join(
        w for w in tokenized_words if w.lower() not in string.punctuation
    )
    # removing stopwords; a set gives O(1) membership tests instead of
    # scanning the stopword list once per token
    lang_stopwords = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(punct_removed)
    stopwords_removed = " ".join(
        w for w in tokens if w.lower() not in lang_stopwords
    )
    # stemming
    stemmer = SnowballStemmer('english')
    stem_words = [stemmer.stem(word)
                  for word in nltk.word_tokenize(stopwords_removed)]
    # concatenate stems with no separator and take character tri-grams
    stemmed_words = "".join(stem_words)
    n_grams = ngrams(stemmed_words, 3)
    return [''.join(grams) for grams in n_grams]
# Pre-process the input text file; on success this is the list of
# character tri-grams, on a missing file it is an error-message string.
tri_gram = pre_processing('text_filename.txt')
#module to calculate hash value of each tri-gram
def calc_hash(txt, n=3, base=26, mod=997):
    """Polynomial rolling hash of the first *n* characters of *txt*.

    Folds each character's code point into ``t = (base*t + ord(c)) % mod``.
    The defaults (n=3, base=26, mod=997) reproduce the original tri-gram
    hash. Strings shorter than *n* hash the characters that are present
    instead of raising IndexError.

    Args:
        txt: string to hash (only the first *n* characters are used).
        n: window length to hash.
        base: multiplier of the polynomial hash.
        mod: modulus keeping the accumulator small.

    Returns:
        int in ``range(mod)``.
    """
    t = 0
    for ch in txt[:n]:
        t = (base * t + ord(ch)) % mod
    return t
# Save the hash value of each tri-gram in a dictionary keyed by position.
# (The original left hash_value empty, so the window loop below was dead
# code that always printed [].)
hash_value = {}
if isinstance(tri_gram, list):  # pre_processing returns a str on error
    for pos, gram in enumerate(tri_gram):
        hash_value[pos] = calc_hash(gram)
# Collect sliding windows of 3 consecutive hash values. The loop stops
# 2 short of the end so no window reads past the last hash (dict.get
# would otherwise pad the final windows with None).
outside_list = []
for i in range(max(len(hash_value) - 2, 0)):
    outside_list.append([hash_value[j] for j in range(i, i + 3)])
print(outside_list)