我正在开发一个程序，它读取一个文本文件，并按升序打印出最常用的前 10 个单词。我已经定义了停用词/连接词，并编写了将它们从词频分析中剔除的代码；但是，停用词仍然出现在分析结果中。
% print_top_words(+File, +N)
%
% Reads File as UTF-8 text, counts how often each lower-cased word occurs,
% and prints a header followed by a ranked table of at most N words, the
% most frequent first.
%
%   File  path of the text file to analyse
%   N     maximum number of rows to print
%
% Entries for which word_to_ignore/1 succeeds are dropped before counting.
print_top_words(File, N) :-
    read_file_to_string(File, String, [encoding(utf8)]),
    % re_split/3 returns an alternating list: the text between matches at
    % the odd positions and the matched words at the even positions.
    re_split("\\w+", String, Words),
    lower_case(Words, Lower),
    % Sort on the word but keep duplicates (@=<) so that merge_words/2 can
    % add up the counts of adjacent equal words.
    sort(1, @=<, Lower, Sorted),
    exclude(word_to_ignore, Sorted, RelevantWords),
    merge_words(RelevantWords, Counted),
    % Sort on the count, highest first. @>= is required here: @> removes
    % every entry whose key equals that of another entry, which would keep
    % only one word per distinct count.
    sort(2, @>=, Counted, Top_words),
    writef("Top %w words:\nRank\tCount\tWord\n", [N]),
    print_top_words(Top_words, N, 1).
% word_to_ignore(?Entry)
%
% Succeeds when Entry names a word listed by ignore_words/1.
%
% Entry may be a bare word or a Word - Count pair, and the word itself may
% be an atom or a string. print_top_words/2 filters "word" - 1 pairs whose
% word is a string produced by string_lower/2, while ignore_words/1 holds
% atoms. A string never unifies with an atom, and a pair never unifies
% with a bare word, so both differences have to be bridged here or no
% entry is ever filtered out.
word_to_ignore(Entry) :-
    nonvar(Entry),
    Entry = Word - _Count,
    !,
    % Judge a pair by its word only; the count is irrelevant.
    word_to_ignore(Word).
word_to_ignore(Word) :-
    % Convert strings to atoms so they can match the atoms in the list.
    % Anything else (atoms, unbound variables) is passed on unchanged.
    (   string(Word)
    ->  atom_string(Atom, Word)
    ;   Atom = Word
    ),
    ignore_words(IgnoreWords),
    member(Atom, IgnoreWords).
% ignore_words(-Words)
%
% Words is the list of atoms to ignore during the frequency analysis.
% The empty atom '' is part of the list as well.
ignore_words([
    '',
    'a', 'an', 'the', 'for', 'of', 'and', 'to', 'in', 'is', 'it', 'on',
    'that', 'with', 'this', 'you', 'be', 'are', 'at', 'or', 'as', 'if',
    'not', 'from'
]).
% lower_case(+Tokens, -Pairs)
%
% Tokens is an odd-length list in which the elements at even positions are
% the words of interest (the layout used by the caller after re_split/3).
% Pairs holds one Lower - 1 entry per word, where Lower is the word
% converted by string_lower/2. The elements at odd positions are skipped.
% Fails for an empty or even-length list.
lower_case(Tokens, Pairs) :-
    (   Tokens = [_],
        Pairs = []
    ->  % Only the trailing separator is left: nothing more to emit.
        true
    ;   Tokens = [_, Word|Remaining],
        Pairs = [Lowered - 1|MorePairs],
        string_lower(Word, Lowered),
        lower_case(Remaining, MorePairs)
    ).
% merge_words(+Pairs, -Merged)
%
% Collapses runs of adjacent Word - Count entries that share the same Word
% into a single entry whose count is the sum of the run. Any other element
% is copied to Merged unchanged. Only adjacent entries are combined, so the
% input has to be ordered by word for a complete tally.
merge_words(Pairs, Merged) :-
    (   Pairs = [],
        Merged = []
    ->  true
    ;   Pairs = [Word - CountA, Word - CountB|Tail]
    ->  % Fold the first two entries into one and look at the list again,
        % so a run of any length ends up as a single entry.
        Total is CountA + CountB,
        merge_words([Word - Total|Tail], Merged)
    ;   Pairs = [First|Others],
        Merged = [First|MergedTail],
        merge_words(Others, MergedTail)
    ).
% print_top_words(+Entries, +N, +R)
%
% Writes one tab-separated "Rank Count Word" row per Word - Count entry,
% numbering the rows from R upwards. Stops when Entries is exhausted or
% when N has been counted down to 0.
print_top_words(Entries, N, R) :-
    (   Entries = []
    ->  true
    ;   N = 0
    ->  true
    ;   Entries = [Word - Count|Remaining],
        writef("%w\t%w\t%w\n", [R, Count, Word]),
        RowsLeft is N - 1,
        NextRank is R + 1,
        print_top_words(Remaining, RowsLeft, NextRank)
    ).
% main
%
% Entry point: prints the 10 most frequent words of "SuspiciousEmail.txt".
main :-
    File = "SuspiciousEmail.txt",
    Limit = 10,
    print_top_words(File, Limit).