我有一个专业术语的数据框,一个用户的数据框和一个论坛帖子的数据框,其中带有R中的文本。目标是选择一组使用术语频率大于0.005并具有术语频率的重要用户在他们的帖子中使用了超过30次的专业用语。到目前为止,这是代码,但是我的ifelse子句没有考虑到相关单词必须来自每个用户。
任何帮助将不胜感激。
jargon_words = "man", "heavy", "today", "last_night", "total"
term_frequency_tf = "hello", "old", "today", "total"
unique_users = "Xman" "23mate", "Hslim", "jacob6"
forum_posts = "hi my name is jeff", "whatsup doc", "hi mate today"
#Convert jargon words to tibble
jargon_words = as.data.frame(jargon_words)
jargon_words = as_tibble(jargon_words)
#Rename the first jargon_words column
names(jargon_words)[1] = 'text'
jargon_words$text = as.character(jargon_words$text)
class(jargon_words$text)
#Stem jargon words
jargon_words = jargon_words %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(word_stem = SnowballC::wordStem(word))
#Remove 'words' column
jargon_words = jargon_words[,-1]
jargon_words
#Convert jargon words to dataframe
jargon_words = as.data.frame(jargon_words)
#Select forum posts and add 'doc_id' column
forum_posts = df %>% select(1)
forum_posts$doc_id = seq.int(nrow(forum_posts))
#Rename the first forum_posts column
names(forum_posts)[1] = 'text'
#Convert to tibble
forum_posts = as_tibble(forum_posts)
#Remove punctuation
forum_posts$text = str_replace_all(forum_posts$text, "[[:punct:]]", " ")
#Remove digits
forum_posts$text = str_replace_all(forum_posts$text, "[[:digit:]]", " ")
#Trim whitespace
forum_posts$text = str_trim(forum_posts$text, side = c("both"))
#Calculate term frequency (tf) for all stemmed words
library(SnowballC)
term_frequency = forum_posts %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
mutate(word_stem = SnowballC::wordStem(word)) %>%
group_by(doc_id) %>%
count(word_stem) %>%
group_by(doc_id) %>%
mutate(post_sum = sum(n)) %>%
bind_tf_idf(word_stem, post_sum, n) %>%
arrange(desc(tf))
#Convert to dataframe
term_frequency = as.data.frame(term_frequency)
#Select words with a term_frequency greater than 0.005
term_frequency_tf = term_frequency$word_stem[term_frequency$tf > 0.005]
#Select unique users
unique_users = levels(df$user)
#Select important users
important_users = vector()
for (user in unique_users){
condition = (df$user == user)
text = forum_posts$text[condition]
relevant_words = term_frequency_tf
x = intersect(jargon_words, relevant_words)
if (length(x) > 30){
print(user)
important_users = c(important_users, user)
}
}
安娜,请从以前的帐户查看您对原始问题的回答: