根据词频选择用户组

问题描述 投票:0回答:1

我在 R 中有三个数据框:一个存放专业术语,一个存放用户,一个存放论坛帖子的文本。目标是选出一组“重要用户”:在他们自己的帖子中,词频(tf)大于 0.005 的专业术语超过 30 个。下面是我目前的代码,但其中的 if 判断没有考虑到:用于比较的相关单词必须来自每个用户各自的帖子。

任何帮助将不胜感激。

# Example inputs (illustrative only): each object is a plain character vector.
# The literals must be wrapped in c(); a bare comma-separated list of strings
# is a syntax error in R.
jargon_words <- c("man", "heavy", "today", "last_night", "total")
term_frequency_tf <- c("hello", "old", "today", "total")
unique_users <- c("Xman", "23mate", "Hslim", "jacob6")
forum_posts <- c("hi my name is jeff", "whatsup doc", "hi mate today")


# Build the stemmed jargon vocabulary.
# Input:  `jargon_words` (character vector or data frame whose first column
#         holds the jargon terms).
# Output: `jargon_words` as a plain data.frame with a `word_stem` column
#         containing the stem of every token that is not a stop word.

# Convert jargon words to a tibble whose first column is called `text`
jargon_words <- as_tibble(as.data.frame(jargon_words))
names(jargon_words)[1] <- "text"

# unnest_tokens() needs character input (the column may arrive as a factor)
jargon_words$text <- as.character(jargon_words$text)
class(jargon_words$text)

# Tokenise, drop stop words, stem, then remove the raw `word` column.
# The column is dropped by name rather than by position so the right column
# is removed regardless of column order, and the join key is stated
# explicitly instead of relying on the implicit common-column lookup.
jargon_words <- jargon_words %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = SnowballC::wordStem(word)) %>%
  select(-word)
jargon_words

# Convert jargon words to dataframe
jargon_words <- as.data.frame(jargon_words)

# Prepare the forum posts for tokenisation.
# NOTE(review): `df` is not defined in this snippet; this assumes its first
# column holds the post text (one row per post) - confirm against the caller.
# Output: `forum_posts`, a tibble with columns `text` (cleaned post text) and
#         `doc_id` (1-based row index of the post in `df`).

# Select the post text (first column of `df`) and name it `text`
forum_posts <- df %>%
  select(1) %>%
  as_tibble()
names(forum_posts)[1] <- "text"

# Add a per-post identifier; seq_len() stays empty for a zero-row input,
# whereas seq.int(0) would yield c(1, 0) and make the assignment fail.
forum_posts$doc_id <- seq_len(nrow(forum_posts))

# Replace punctuation and digits with spaces, then trim surrounding whitespace
forum_posts$text <- forum_posts$text %>%
  str_replace_all("[[:punct:]]", " ") %>%
  str_replace_all("[[:digit:]]", " ") %>%
  str_trim(side = "both")

# Calculate the term frequency (tf) of every stemmed word within each post.
# Output: `term_frequency`, a data.frame with one row per (doc_id, word_stem)
#         and columns n, post_sum, tf, idf, tf_idf, sorted by descending tf;
#         `term_frequency_tf`, the stems whose tf exceeds 0.005 in some post.
library(SnowballC)
term_frequency <- forum_posts %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  mutate(word_stem = SnowballC::wordStem(word)) %>%
  count(doc_id, word_stem) %>%
  # post_sum = number of (non-stop-word) tokens in the post
  group_by(doc_id) %>%
  mutate(post_sum = sum(n)) %>%
  ungroup() %>%
  # bind_tf_idf(tbl, term, document, n): the document argument must identify
  # the post. Passing post_sum here would pool all posts that happen to have
  # the same token total and produce wrong tf values.
  bind_tf_idf(word_stem, doc_id, n) %>%
  arrange(desc(tf))

# Convert to dataframe
term_frequency <- as.data.frame(term_frequency)

# Select words with a term_frequency greater than 0.005
term_frequency_tf <- term_frequency$word_stem[term_frequency$tf > 0.005]

# Select the important users: users whose OWN posts contain more than 30
# distinct jargon stems with a term frequency above 0.005.
# Reads:  `df$user` (author of each post, row-aligned with `forum_posts`),
#         `forum_posts$doc_id`, `term_frequency` (doc_id, word_stem, tf) and
#         `jargon_words$word_stem`.
# Output: `unique_users` (character vector of all users) and
#         `important_users` (character vector of the selected users).

# Select unique users. unique() works for both factor and character columns;
# levels() returns NULL for a character column.
unique_users <- unique(as.character(df$user))
unique_users <- unique_users[!is.na(unique_users)]

# Jargon stems as a plain character vector (jargon_words is a data.frame, so
# it cannot be intersected with a character vector directly).
jargon_stems <- unique(as.character(jargon_words$word_stem))

# Rows of term_frequency that pass the tf threshold, computed once
is_frequent <- term_frequency$tf > 0.005

# Select important users
is_important <- vapply(
  unique_users,
  function(user) {
    # Restrict the comparison to the posts written by this user
    user_doc_ids <- forum_posts$doc_id[which(df$user == user)]
    user_terms <- term_frequency$word_stem[
      is_frequent & term_frequency$doc_id %in% user_doc_ids
    ]
    # base::intersect is named explicitly because dplyr masks intersect()
    length(base::intersect(jargon_stems, user_terms)) > 30
  },
  logical(1)
)
important_users <- unique_users[is_important]

for (user in important_users) {
  print(user)
}

r if-statement select intersect word-frequency
1个回答
0
投票

安娜,请从以前的帐户查看您对原始问题的回答:

© www.soinside.com 2019 - 2024. All rights reserved.