analyze_document <- function(filename, stop_words = stopwords("english")) {
  # Analyze word frequencies in a text document and build a markdown report.
  #
  # Args:
  #   filename:   path to a UTF-8 text file.
  #   stop_words: character vector of words to exclude. Defaults to the
  #               English stop-word list from the tm package (same as the
  #               original hard-coded call); evaluated lazily, so passing an
  #               explicit vector avoids the tm dependency entirely.
  #
  # Returns:
  #   list(report = markdown report string,
  #        word_counts = named integer vector of term frequencies,
  #                      sorted in decreasing order).

  # Read the whole file and flatten it into one string
  text <- paste(readLines(filename, encoding = "UTF-8", warn = FALSE),
                collapse = " ")

  # Preprocessing: lower-case, then strip punctuation. The character class
  # must KEEP spaces ("[^[:alnum:] ]"): the original "[^[:alnum:]]" removed
  # the separators too, collapsing the document into a single giant token
  # (hence the one-element result with count 1).
  text <- tolower(text)
  text <- gsub("[^[:alnum:] ]", " ", text)

  # Tokenize on runs of whitespace and drop empty tokens
  tokens <- strsplit(trimws(text), "[[:space:]]+")[[1]]
  tokens <- tokens[nzchar(tokens)]

  # Remove stop words
  filtered_tokens <- tokens[!tokens %in% stop_words]

  # Word frequency analysis. table() counts each distinct token directly;
  # the original DocumentTermMatrix put ONE token per document and then
  # summed rows, which yields 1 for every row instead of term frequencies.
  freq_table <- sort(table(filtered_tokens), decreasing = TRUE)
  word_counts <- setNames(as.integer(freq_table), names(freq_table))

  # Print the top 10 most frequent words and their counts
  print(head(word_counts, 10))

  # Generate report title
  report_title <- paste("## Word Frequency Analysis for", filename)
  # Generate abstract
  abstract <- paste0(
    "This report summarizes the word frequency distribution in the document ",
    filename, ". It aims to identify the most frequently used words ",
    "excluding common stop words.\n\n"
  )
  # Generate table header
  table_header <- "| Word | Frequency |\n|---|---|\n"
  # Generate table rows. Use names(), not rownames(): word_counts is a named
  # vector, and rownames() on it returns NULL (which made the table empty).
  # collapse= joins the rows into one string (the original paste() left a
  # character vector that never made it into the report correctly).
  table_rows <- paste0("| ", names(word_counts), " | ", word_counts, " |",
                       collapse = "\n")
  # Generate conclusion
  conclusion <- paste0(
    "\n\nThe table displays the most frequent words after removing stop words. ",
    "Analyzing word frequency can help identify potentially overused words ",
    "or repetitive language patterns, aiding in revising and refining the text.\n"
  )

  # Combine report sections (title separated from the abstract by a blank
  # line so the markdown renders correctly)
  report <- paste0(report_title, "\n\n", abstract, table_header, table_rows,
                   conclusion)

  list(report = report, word_counts = word_counts)
}
# Location of the document to analyze (Windows absolute path)
filename <- "C:\\Users\\bruh\\OneDrive\\Documents\\dictionary.txt"

# Run the analysis; returns a list with the markdown report and the counts
report <- analyze_document(filename)

# Show the generated markdown report on the console
cat(report$report)

# Persist the word counts to disk as a structured table
write.table(report$word_counts, "word_counts.txt", row.names = TRUE)
我正在尝试使用 R 分析文本文档中的词频,但结果并不理想。 代码运行没有错误,但生成的字数只是一个值为 1 的单元素向量。 根据此数据生成的报告是空的。
预期行为:
我预计会出现一个按降序排列的词频列表,以及一份包含最常用词表的报告。
尝试的解决方案:
我尝试过以下方法:
附加信息:
具体问题:
我非常感谢任何解决此问题并确保准确的词频分析的见解或指导。
谢谢!
你那里有相当长的 GPT 函数。
在我看来,你的主要问题是获取实际的 tf-idf 值。其余的都是基于它。
试试这个:
library(tidyverse)
library(tidytext) # unnest_tokens() + bind_tf_idf() for the tf-idf approach
library(tidylo) # Julia Silge's weighted log-odds approach
# A dummy tibble with two documents ("abstract", "conclusion") and their
# raw text in the term column.
function_description <-
tibble(
document = c("abstract", "conclusion"),
term = c(
"This report summarizes the word frequency distribution in the document,
filename. It aims to identify the most frequently used words, excluding common stop words.",
"The table displays the most frequent words after removing stop words.,
Analyzing word frequency can help identify potentially overused words,
or repetitive language patterns, aiding in revising and refining the text."
)
)
function_description %>%
unnest_tokens(word, term) %>% # tokenize each document down to one word per row
count(document,word, sort = TRUE) %>% # per-document word frequencies (column n)
bind_tf_idf(word, document, n) %>% # add tf, idf and tf_idf columns
tidylo::bind_log_odds(set = document, feature = word, n = n) # add weighted log-odds per word within each document
R 中的 NLP:https://www.tidytextmining.com/ 在 YouTube 上搜索 Julia Silge。她很棒,有很多 NLP 教程