Multi-word terms and TF-IDF text analysis in R


I am fairly new to R, and I am trying to run a text analysis with TF-IDF on a set of reports, restricted to a specific set of words from a dictionary I built. The code below produces results, but it does not account for multi-word terms: it can count "technology", for example, but not "data technology". I need to fix the code so that multi-word terms are included in the analysis.

Here is the code I am using:

# Load libraries
library(tidyverse)
library(tm)
library(tidytext)
library(readxl)

# Setting the folder where the documents are (set to subfolder 2012 for now to make it easier to handle)
wd <- "C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/2012"

# Create the corpus and clean it up a bit
corpus <- Corpus(DirSource(wd, recursive = TRUE)) # Create corpus
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove English stop words

# Create a DocumentTerm Matrix
dtm <- DocumentTermMatrix(corpus)

# Use multiple steps to...
corpus_words <- tidy(dtm) %>% # ... transform the dtm to a tidy object
  bind_tf_idf(term, document, count) # ... use bind_tf_idf() from tidytext to calculate tf-idf

total_words <- corpus_words %>% group_by(document) %>% summarize(total = sum(count)) # Calculate the number of words in each document
corpus_words <- left_join(corpus_words, total_words) # add it to the table

# Get the words of interest from the dictionary and rename the columns
dictionary <- read_xlsx("C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx", col_names = FALSE)
names(dictionary) <- c("term", "group")

# Take the individual term lists
inno_terms    <- dictionary$term[dictionary$group==1]
techno_terms  <- dictionary$term[dictionary$group==2]
data_terms    <- dictionary$term[dictionary$group==3]
digital_terms <- dictionary$term[dictionary$group==4]

# Filter the corpus for the words of interest
TF_IDF_Inno_terms2 <- corpus_words %>% filter(grepl(paste(inno_terms, collapse = "|"), term))
TF_IDF_techno_terms2 <- corpus_words %>% filter(grepl(paste(techno_terms, collapse = "|"), term))
TF_IDF_data_terms2 <- corpus_words %>% filter(grepl(paste(data_terms, collapse = "|"), term))
TF_IDF_digital_terms2 <- corpus_words %>% filter(grepl(paste(digital_terms, collapse = "|"), term))

I have tried different variations of this code to solve the multi-word problem, but without success.

#ATTEMPT 2#

# Function to check if all multi-word terms are present in a document
check_multiword <- function(doc, multiword_terms) {
  all_terms_present <- all(sapply(multiword_terms, function(term) grepl(term, doc)))
  return(all_terms_present)
}

# Filter the corpus for documents containing multi-word terms of interest
docs_with_multiword_inno <- Filter(function(doc) check_multiword(doc, inno_terms), corpus)
docs_with_multiword_techno <- Filter(function(doc) check_multiword(doc, techno_terms), corpus)
docs_with_multiword_data <- Filter(function(doc) check_multiword(doc, data_terms), corpus)
docs_with_multiword_digital <- Filter(function(doc) check_multiword(doc, digital_terms), corpus)

#ATTEMPT 3#

# Filter the corpus words for the documents containing multi-word terms of interest
corpus_words_inno <- corpus_words %>% filter(document %in% docs_with_multiword_inno)
corpus_words_techno <- corpus_words %>% filter(document %in% docs_with_multiword_techno)
corpus_words_data <- corpus_words %>% filter(document %in% docs_with_multiword_data)
corpus_words_digital <- corpus_words %>% filter(document %in% docs_with_multiword_digital)

#ATTEMPT 4#

# Function to check if all multi-word terms are present in a document
check_multiword <- function(doc, multiword_terms) {
  all_terms_present <- all(sapply(multiword_terms, function(term) grepl(paste0("\\b", term, "\\b"), doc, ignore.case = TRUE)))
  return(all_terms_present)
}

# Filter the corpus for documents containing multi-word terms of interest
docs_with_multiword_inno <- Filter(function(doc) check_multiword(doc, inno_terms), corpus)
docs_with_multiword_techno <- Filter(function(doc) check_multiword(doc, techno_terms), corpus)
docs_with_multiword_data <- Filter(function(doc) check_multiword(doc, data_terms), corpus)
docs_with_multiword_digital <- Filter(function(doc) check_multiword(doc, digital_terms), corpus)

# Filter the corpus words for the documents containing multi-word terms of interest
corpus_words_inno <- corpus_words %>% filter(document %in% docs_with_multiword_inno)
corpus_words_techno <- corpus_words %>% filter(document %in% docs_with_multiword_techno)
corpus_words_data <- corpus_words %>% filter(document %in% docs_with_multiword_data)
corpus_words_digital <- corpus_words %>% filter(document %in% docs_with_multiword_digital)

#ATTEMPT 5#

# Create the corpus and clean it up a bit
corpus <- Corpus(DirSource(wd, recursive = TRUE)) # Create corpus
corpus <- tm_map(corpus, removePunctuation) # remove punctuation
corpus <- tm_map(corpus, removeNumbers) # remove numbers
corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove English stop words

# Tokenize the text into n-grams (multiword terms)
multiword_terms <- c("new product", "new products", "new technologies", "new services",
                     "new solutions", "renew services", "artificial intelligence",
                     "machine learning", "data technology", "data security",
                     "data protection", "personal data", "data collection",
                     "store data", "internal data", "external data", "data privacy",
                     "data centers", "data driven", "customer data", "data from customer",
                     "data science", "data collection", "data analysis", "big data",
                     "market data", "data sets")
custom_tokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), n = 1:2), paste, collapse = " "))
}

corpus <- tm_map(corpus, content_transformer(custom_tokenizer))

# Create a DocumentTerm Matrix
dtm <- DocumentTermMatrix(corpus)

# Get the words of interest from the dictionary and rename the columns
dictionary <- read_xlsx("C:/Users/ple.si/Dropbox (CBS)/Manegerial Digital Attention (MAD)/New set 10K/DictionaryLIWCDigital_OnlyDigital_TG.xlsx", col_names = FALSE)
names(dictionary) <- c("term", "group")

# Take the individual term lists
inno_terms    <- c(dictionary$term[dictionary$group == 1], multiword_terms)
techno_terms  <- c(dictionary$term[dictionary$group == 2], multiword_terms)
data_terms    <- c(dictionary$term[dictionary$group == 3], multiword_terms)
digital_terms <- c(dictionary$term[dictionary$group == 4], multiword_terms)

# Use multiple steps to...
corpus_words <- tidy(dtm) %>% # ... transform the dtm to a tidy object
  bind_tf_idf(term, document, count) # ... use bind_tf_idf() from tidytext to calculate tf-idf

total_words <- corpus_words %>% group_by(document) %>% summarize(total = sum(count)) # Calculate the number of words in each document
corpus_words <- left_join(corpus_words, total_words) # add it to the table

# Filter the corpus for the words of interest
TF_IDF_Inno_terms5 <- corpus_words %>% filter(term %in% inno_terms)
TF_IDF_techno_terms5 <- corpus_words %>% filter(term %in% techno_terms)
TF_IDF_data_terms5 <- corpus_words %>% filter(term %in% data_terms)
TF_IDF_digital_terms5 <- corpus_words %>% filter(term %in% digital_terms)

No matter which variant of the code I use, the results do not account for multi-word terms.

Thanks :)

r nlp tidyverse tf-idf tidytext
1 Answer

If you want to analyze text at a different tokenization level than single words, you will need to set up that tokenization yourself (probably not using Corpus()).
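For example, instead of building a Corpus() from the directory, you could read the files into a tidy tibble yourself. A minimal sketch, assuming the reports are plain-text files under the wd folder from the question (the document and text column names are just illustrative choices):

library(tidyverse)
library(tidytext)

# One row per document: file name plus its full text
files <- list.files(wd, recursive = TRUE, full.names = TRUE)
docs <- tibble(
  document = basename(files),
  text = map_chr(files, ~ paste(readLines(.x, warn = FALSE), collapse = " "))
)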

You can find bigrams like this:

library(tidyverse)
library(tidytext)

tidy_bigrams <- janeaustenr::austen_books() |> 
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

bigram_counts <- tidy_bigrams |> 
  count(book, bigram, sort = TRUE) |> 
  filter(!is.na(bigram))

bigram_counts
#> # A tibble: 300,903 × 3
#>    book                bigram     n
#>    <fct>               <chr>  <int>
#>  1 Mansfield Park      of the   712
#>  2 Mansfield Park      to be    612
#>  3 Emma                to be    586
#>  4 Mansfield Park      in the   533
#>  5 Emma                of the   529
#>  6 Pride & Prejudice   of the   439
#>  7 Emma                it was   430
#>  8 Pride & Prejudice   to be    422
#>  9 Sense & Sensibility to be    418
#> 10 Emma                in the   416
#> # ℹ 300,893 more rows

Created on 2023-08-15 with reprex v2.0.2

Then you can calculate the tf-idf of those bigrams like this:

bigram_counts |> 
  bind_tf_idf(bigram, book, n) |> 
  arrange(desc(tf_idf))
#> # A tibble: 300,903 × 6
#>    book                bigram                n      tf   idf  tf_idf
#>    <fct>               <chr>             <int>   <dbl> <dbl>   <dbl>
#>  1 Pride & Prejudice   mr darcy            230 0.00206  1.79 0.00370
#>  2 Persuasion          captain wentworth   143 0.00187  1.79 0.00335
#>  3 Mansfield Park      sir thomas          266 0.00181  1.79 0.00324
#>  4 Persuasion          mr elliot           133 0.00174  1.79 0.00312
#>  5 Sense & Sensibility mrs jennings        185 0.00169  1.79 0.00303
#>  6 Emma                mr knightley        239 0.00162  1.79 0.00291
#>  7 Persuasion          lady russell        110 0.00144  1.79 0.00258
#>  8 Persuasion          sir walter          108 0.00141  1.79 0.00253
#>  9 Emma                mrs weston          208 0.00141  1.79 0.00253
#> 10 Mansfield Park      miss crawford       196 0.00133  1.79 0.00239
#> # ℹ 300,893 more rows

Created on 2023-08-15 with reprex v2.0.2

If you want to include both unigrams (single words) and bigrams, you can pass n = 2, n_min = 1 when calling unnest_tokens(), as in the sketch below.
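A minimal sketch of how that could look for the question's setup, assuming the docs tibble from the earlier sketch and the data_terms vector from the question (note that unnest_tokens() lowercases by default, so multi-word dictionary entries need to be lowercase, space-separated phrases):

# Tokenize into unigrams and bigrams in one pass
tidy_tokens <- docs |>
  unnest_tokens(term, text, token = "ngrams", n = 2, n_min = 1) |>
  filter(!is.na(term)) |>
  count(document, term)

# Compute tf-idf over the full vocabulary first, then filter, so tf and
# idf are based on all terms rather than only the dictionary subset
TF_IDF_data_terms <- tidy_tokens |>
  bind_tf_idf(term, document, n) |>
  filter(term %in% data_terms) # exact match, so "data technology" is counted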
