Keeping a large number of words in a text corpus with the R tm package


I have about 70,000 frequent_words that I want to keep in a text corpus, in the order in which they appear (order matters). I obtained them like this:

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=50)

Just doing:

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
dtm <- removeSparseTerms(dtm, 0.8)

does not work, because I need the same filtered text_corpus twice:

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))

I tried the code below, which keeps only the given words and replaces everything else with a space:

keepWords <- content_transformer(function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b",  paste(words, collapse = "\\b|\\b"), "\\b)"), x, perl = T, ignore.case=T, useBytes = T)
             , invert = T) <- " "
  return(x)
})
txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)

When I run it, I get the following error:

Error in gregexpr(paste0("(\\b", paste(words, collapse = "\\b|\\b"), "\\b)"),  : 
  assertion 'tree->num_tags == num_tags' failed in executing regexp: file 'tre-compile.c', line 634
Calls: preprocess ... tm_parLapply -> lapply -> FUN -> FUN -> regmatches<- -> gregexpr
Execution halted

This is caused by the regular expression becoming too long. Removing the non-frequent words instead is not an option, since length(less_frequent_words) > 1,000,000 and it takes too long with:

chunk <- 500
n <- length(less_frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(less_frequent_words, r)

for (i in 1:length(d)) {
  txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}

I also tried something with joining, but it gives me a separate text corpus in each iteration:

chunk <- 500
n <- length(frequent_words)
r <- rep(1:ceiling(n/chunk),each=chunk)[1:n]
d <- split(frequent_words, r)

joined_txt_corpus <- VCorpus(VectorSource(list()))
for (i in 1:length(d)) {
  new_corpus <- tm_map(txt_corpus, keepWords, c(paste(d[[i]])))
  joined_txt_corpus <- c(joined_txt_corpus, new_corpus)
  txt_corpus <- tm_map(txt_corpus, removeWords, c(paste(d[[i]])))
}
txt_corpus <- joined_txt_corpus

Is there an efficient way to do the same selection as text_corpus <- tm_map(txt_corpus, keepWords, frequent_words), but with a large number of words? Any help and hints are appreciated. Thanks!

Reproducible example:

library(tm)
data(crude)

txt_corpus <- crude

txt_corpus <- tm_map(txt_corpus, content_transformer(tolower))
txt_corpus <- tm_map(txt_corpus, removePunctuation)
txt_corpus <- tm_map(txt_corpus, stripWhitespace)

article_words <- c("a", "an", "the")
txt_corpus <- tm_map(txt_corpus, removeWords, article_words)
txt_corpus <- tm_map(txt_corpus, removeNumbers)

dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf)))
frequent_words <- findFreqTerms(dtm, lowfreq=80)
dtm <- DocumentTermMatrix(txt_corpus, control = list(wordLengths=c(1, Inf), dictionary=frequent_words))

# To simulate a large word list, uncomment the next line; using frequent_words just once works
# frequent_words <- c(frequent_words, frequent_words, frequent_words, frequent_words)

# keepWords function
keepWords <- content_transformer(function(x, words) {
  regmatches(x,
             gregexpr(paste0("(\\b",  paste(words, collapse = "\\b|\\b"), "\\b)"), x, perl = T, ignore.case=T)
             , invert = T) <- " "
  return(x)
})

txt_corpus <- tm_map(txt_corpus, keepWords, frequent_words)

# Get bigram from text_corpus
BigramTokenizer <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
bidtm <- DocumentTermMatrix(txt_corpus, control = list(tokenize = BigramTokenizer))
bidtmm <- slam::col_sums(bidtm)  # col_sums() comes from the slam package, which tm does not attach
bidtmm <- as.matrix(bidtmm)
print(bidtmm)

Output:

        [,1]
in in     14
in of     21
in oil    19
in to     28
of in     21
of of     20
of oil    20
of to     29
oil in    18
oil of    18
oil oil   13
oil to    33
to in     32
to of     35
to oil    21
to to     41
1 Answer

I looked at your requirements, and a combination of tm and quanteda might help. See below.

Once you have the list of frequent words, you can use quanteda in parallel to get the bigrams.

library(quanteda)

# set number of threads 
quanteda_options(threads = 4) 

my_corp <- corpus(crude) # corpus from tm can be used here (txt_corpus)
my_toks <- tokens(my_corp, remove_punct = TRUE) # add extra removal if needed

# Use list of frequent words from tm. 
# speed gain should occur here
my_toks <- tokens_keep(my_toks, frequent_words)

# bigrams (n = 2 is the default), concatenator is _ by default
bitoks <- tokens_ngrams(my_toks)

textstat_frequency(dfm(bitoks)) # ordered from high to low

   feature frequency rank docfreq group
1    to_to        41    1      12   all
2    to_of        35    2      15   all
3   oil_to        33    3      17   all
4    to_in        32    4      12   all
5    of_to        29    5      14   all
6    in_to        28    6      11   all
7    in_of        21    7       8   all
8   to_oil        21    7      13   all
9    of_in        21    7      10   all
10  of_oil        20   10      14   all
11   of_of        20   10       8   all
12  in_oil        19   12      10   all
13  oil_in        18   13      11   all
14  oil_of        18   13      11   all
15   in_in        14   15       9   all
16 oil_oil        13   16      10   all

quanteda does have a topfeatures function, but it does not work like findFreqTerms; otherwise you could do everything in quanteda.
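
For illustration, a minimal sketch of an all-quanteda version, assuming your quanteda version provides dfm_trim() with a min_termfreq argument (older releases name it min_count); the threshold of 50 mirrors the lowfreq used with findFreqTerms above, so this replaces findFreqTerms rather than reproducing it exactly:

library(quanteda)

my_corp <- corpus(crude)
my_toks <- tokens(my_corp, remove_punct = TRUE)

# frequency filter on the unigram dfm, replacing findFreqTerms(dtm, lowfreq = 50)
freq_feats <- featnames(dfm_trim(dfm(my_toks), min_termfreq = 50))

# keep only the frequent words, then build the bigrams
my_toks <- tokens_keep(my_toks, freq_feats)
bitoks  <- tokens_ngrams(my_toks, n = 2)

textstat_frequency(dfm(bitoks))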

If generating the dfm from the tokens takes up too much memory, you can use as.character to convert the tokens object and then count with dplyr or data.table. See the code below.

library(dplyr)
out_dp <- tibble(features = as.character(bitoks)) %>% 
  group_by(features) %>% 
  tally()


library(data.table)
out_dt <- data.table(features = as.character(bitoks))
out_dt <- out_dt[, .N, by = features]
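
As a small follow-up, either result can be sorted to get the same high-to-low ordering that textstat_frequency produces (column names as created above):

out_dt[order(-N)]             # data.table, descending by count
out_dp %>% arrange(desc(n))   # dplyr equivalent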