如何使用 Python 3 遍历阿拉伯文本并计算单词频率？
import arabic_reshaper
from pyarabic.araby import tokenize
from pyarabic.unshape import unshaping_text
from pyarabic import araby
# Count how often each token occurs in an Arabic text and print the result.
#
# The text is reshaped with arabic_reshaper, reversed, tokenized with
# pyarabic's tokenize(), and the tokens are tallied in `dictionary`.
text = 'your text here'  # or your text path as follow
path = "path/file.txt"
# Read the file's *contents*: open() alone returns a file object, while the
# reshaper works on text. The with-block also closes the file afterwards.
with open(path, encoding='utf-8') as source_file:
    text = source_file.read()
text_to_be_reshaped = text
reshaped_text = arabic_reshaper.reshape(text_to_be_reshaped)
rev_text = reshaped_text[::-1]  # slice backwards
# NOTE(review): the counts are keyed on the reshaped, reversed tokens
# (presumably so they display correctly in a left-to-right terminal) --
# confirm this is intended rather than counting the raw words.
dictionary = {}
lst = tokenize(rev_text)
for elements in lst:
    # get() returns 0 for a token seen for the first time.
    dictionary[elements] = dictionary.get(elements, 0) + 1
print(dictionary)