我对 R 还很陌生,我正在尝试在一堆报告中运行文本分析和 TF-IDF,考虑到我构建的字典中的一组特定单词。下面的代码提供了结果,但是它没有考虑多个单词。例如,它可以算“技术”,但不能算“数据技术”。我需要修复代码,以便在分析中包含多个单词。
请参阅下面我使用的代码:
# Load libraries
library(tidyverse)
library(tm)
library(tidytext)
library(readxl)
# Setting the folder where the documents are (set to subfolder 2012 for now to make it easier to handle)
wd <- "C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/2012"
# Create the corpus and clean it up a bit
corpus <- Corpus(DirSource(wd, recursive = TRUE)) # Create corpus
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove English stop words
# Create a DocumentTerm Matrix
dtm <- DocumentTermMatrix(corpus)
# Use multiple steps to...
corpus_words <- tidy(dtm) %>% # ... transform the dtm to a tidy object
bind_tf_idf(term, document, count) # ... use the tf_idf function from tidytext to calculate
total_words <- corpus_words %>% group_by(document) %>% summarize(total = sum(count)) # Calculate the number of words in each document
corpus_words <- left_join(corpus_words, total_words) # add it to the table
# Get the words of interest from the dictionary and rename the columns
dictionary <- read_xlsx("C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx", col_names = FALSE)
names(dictionary) <- c("term", "group")
# Take the individual term lists
inno_terms <- dictionary$term[dictionary$group==1]
techno_terms <- dictionary$term[dictionary$group==2]
data_terms <- dictionary$term[dictionary$group==3]
digital_terms <- dictionary$term[dictionary$group==4]
# Filter the corpus for the words of interest
TF_IDF_Inno_terms2 <- corpus_words %>% filter(grepl(paste(inno_terms, collapse = "|"), term))
TF_IDF_techno_terms2 <- corpus_words %>% filter(grepl(paste(techno_terms, collapse = "|"), term))
TF_IDF_data_terms2 <- corpus_words %>% filter(grepl(paste(data_terms, collapse = "|"), term))
TF_IDF_digital_terms2 <- corpus_words %>% filter(grepl(paste(digital_terms, collapse = "|"), term))
我尝试了此代码的不同变体来解决多字问题,但没有成功。
#ATTEMP 2#
# Function to check if all multi-word terms are present in a document
check_multiword <- function(doc, multiword_terms) {
all_terms_present <- all(sapply(multiword_terms, function(term) grepl(term, doc)))
return(all_terms_present)
}
# Filter the corpus for documents containing multi-word terms of interest
docs_with_multiword_inno <- Filter(function(doc) check_multiword(doc, inno_terms), corpus)
docs_with_multiword_techno <- Filter(function(doc) check_multiword(doc, techno_terms), corpus)
docs_with_multiword_data <- Filter(function(doc) check_multiword(doc, data_terms), corpus)
docs_with_multiword_digital <- Filter(function(doc) check_multiword(doc, digital_terms), corpus)
#ATTEMPT 3#
# Filter the corpus words for the documents containing multi-word terms of interest
corpus_words_inno <- corpus_words %>% filter(document %in% docs_with_multiword_inno)
corpus_words_techno <- corpus_words %>% filter(document %in% docs_with_multiword_techno)
corpus_words_data <- corpus_words %>% filter(document %in% docs_with_multiword_data)
corpus_words_digital <- corpus_words %>% filter(document %in% docs_with_multiword_digital)
#ATTEMPT 4#
# Function to check if all multi-word terms are present in a document
check_multiword <- function(doc, multiword_terms) {
all_terms_present <- all(sapply(multiword_terms, function(term) grepl(paste0("\\b", term, "\\b"), doc, ignore.case = TRUE)))
return(all_terms_present)
}
# Filter the corpus for documents containing multi-word terms of interest
docs_with_multiword_inno <- Filter(function(doc) check_multiword(doc, inno_terms), corpus)
docs_with_multiword_techno <- Filter(function(doc) check_multiword(doc, techno_terms), corpus)
docs_with_multiword_data <- Filter(function(doc) check_multiword(doc, data_terms), corpus)
docs_with_multiword_digital <- Filter(function(doc) check_multiword(doc, digital_terms), corpus)
# Filter the corpus words for the documents containing multi-word terms of interest
corpus_words_inno <- corpus_words %>% filter(document %in% docs_with_multiword_inno)
corpus_words_techno <- corpus_words %>% filter(document %in% docs_with_multiword_techno)
corpus_words_data <- corpus_words %>% filter(document %in% docs_with_multiword_data)
corpus_words_digital <- corpus_words %>% filter(document %in% docs_with_multiword_digital)
#ATTEMPT 5#
# Create the corpus and clean it up a bit
corpus <- Corpus(DirSource(wd, recursive = TRUE)) # Create corpus
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove English stop words
# Tokenize the text into n-grams (multiword terms)
multiword_terms <- c("new product", "new products", "new technologies", "new services",
"new solutions", "renew services", "artificial intelligence",
"machine learning", "data technology", "data security",
"data protection", "personal data", "data collection",
"store data", "internal data", "external data", "data privacy",
"data centers", "data driven", "customer data", "data from customer",
"data science", "data collection", "data analysis", "big data",
"market data", "data sets")
custom_tokenizer <- function(x) {
unlist(lapply(ngrams(words(x), n = 1:2), paste, collapse = " "))
}
corpus <- tm_map(corpus, content_transformer(custom_tokenizer))
# Create a DocumentTerm Matrix
dtm <- DocumentTermMatrix(corpus)
# Get the words of interest from the dictionary and rename the columns
dictionary <- read_xlsx("C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx", col_names = FALSE)
names(dictionary) <- c("term", "group")
# Take the individual term lists
inno_terms <- c(dictionary$term[dictionary$group == 1], multiword_terms)
techno_terms <- c(dictionary$term[dictionary$group == 2], multiword_terms)
data_terms <- c(dictionary$term[dictionary$group == 3], multiword_terms)
digital_terms <- c(dictionary$term[dictionary$group == 4], multiword_terms)
# Use multiple steps to...
corpus_words <- tidy(dtm) %>% # ... transform the dtm to a tidy object
bind_tf_idf(term, document, count) # ... use the tf_idf function from tidytext to calculate
total_words <- corpus_words %>% group_by(document) %>% summarize(total = sum(count)) # Calculate the number of words in each document
corpus_words <- left_join(corpus_words, total_words) # add it to the table
# Filter the corpus for the words of interest
TF_IDF_Inno_terms5 <- corpus_words %>% filter(term %in% inno_terms)
TF_IDF_techno_terms5 <- corpus_words %>% filter(term %in% techno_terms)
TF_IDF_data_terms5 <- corpus_words %>% filter(term %in% data_terms)
TF_IDF_digital_terms5 <- corpus_words %>% filter(term %in% digital_terms)
无论代码如何,结果都不考虑多字。
谢谢:)
如果您想分析与单个单词不同的标记化级别的文本,您需要自己设置该标记化(可能不使用
Corpus()
)。
你可以找到这样的二元组:
library(tidyverse)
library(tidytext)
tidy_bigrams <- janeaustenr::austen_books() |>
unnest_tokens(bigram, text, token = "ngrams", n = 2)
bigram_counts <- tidy_bigrams |>
count(book, bigram, sort = TRUE) |>
filter(!is.na(bigram))
bigram_counts
#> # A tibble: 300,903 × 3
#> book bigram n
#> <fct> <chr> <int>
#> 1 Mansfield Park of the 712
#> 2 Mansfield Park to be 612
#> 3 Emma to be 586
#> 4 Mansfield Park in the 533
#> 5 Emma of the 529
#> 6 Pride & Prejudice of the 439
#> 7 Emma it was 430
#> 8 Pride & Prejudice to be 422
#> 9 Sense & Sensibility to be 418
#> 10 Emma in the 416
#> # ℹ 300,893 more rows
创建于 2023-08-15,使用 reprex v2.0.2
然后你可以像这样计算这些二元组的 tf-idf:
bigram_counts |>
bind_tf_idf(bigram, book, n) |>
arrange(desc(tf_idf))
#> # A tibble: 300,903 × 6
#> book bigram n tf idf tf_idf
#> <fct> <chr> <int> <dbl> <dbl> <dbl>
#> 1 Pride & Prejudice mr darcy 230 0.00206 1.79 0.00370
#> 2 Persuasion captain wentworth 143 0.00187 1.79 0.00335
#> 3 Mansfield Park sir thomas 266 0.00181 1.79 0.00324
#> 4 Persuasion mr elliot 133 0.00174 1.79 0.00312
#> 5 Sense & Sensibility mrs jennings 185 0.00169 1.79 0.00303
#> 6 Emma mr knightley 239 0.00162 1.79 0.00291
#> 7 Persuasion lady russell 110 0.00144 1.79 0.00258
#> 8 Persuasion sir walter 108 0.00141 1.79 0.00253
#> 9 Emma mrs weston 208 0.00141 1.79 0.00253
#> 10 Mansfield Park miss crawford 196 0.00133 1.79 0.00239
#> # ℹ 300,893 more rows
创建于 2023-08-15,使用 reprex v2.0.2
创建于 2023-08-15,使用 reprex v2.0.2
如果您想包含一元词(单词)和二元词,您可以在调用
n = 2, n_min = 1
时使用 unnest_tokens()
。