
问题描述 投票:7回答:3


# Example data: ten strings of one to ten words each.
#   word.count   - number of words in `string`
#   stop.at.word - position of the last word that belongs to the "input" part
texts.dt <- data.table(
  string = c(
    "one",
    "two words",
    "three words here",
    "four useless words here",
    "five useless meaningless words here",
    "six useless meaningless words here just",
    "seven useless meaningless words here just to",
    "eigth useless meaningless words here just to fill",
    "nine useless meaningless words here just to fill up",
    "ten useless meaningless words here just to fill up space"
  ),
  word.count = seq_len(10L),
  stop.at.word = c(0, 1, 2, 2, 4, 3, 3, 6, 7, 5)
)


                                                          string word.count stop.at.word
 1:                                                      one          1            0
 2:                                                two words          2            1
 3:                                         three words here          3            2
 4:                                  four useless words here          4            2
 5:                      five useless meaningless words here          5            4
 6:                  six useless meaningless words here just          6            3
 7:             seven useless meaningless words here just to          7            3
 8:        eigth useless meaningless words here just to fill          8            6
 9:      nine useless meaningless words here just to fill up          9            7
10: ten useless meaningless words here just to fill up space         10            5

在实际应用中,stop.at.word 列中的值是随机确定的(上限为 word.count - 1)。此外,字符串并不按长度排序,但这应该不会影响结果。

代码应该添加两列 input 和 output:input 包含从第 1 个单词到第 stop.at.word 个单词的子字符串,output 包含紧随其后的那个单词(仅一个单词),如下所示:

                                                          string word.count stop.at.word                                       input
     1:                                                      one          1            0                                            
     2:                                                two words          2            1                                         two
     3:                                         three words here          3            2                                 three words
     4:                                  four useless words here          4            2                                four useless
     5:                      five useless meaningless words here          5            4              five useless meaningless words
     6:                  six useless meaningless words here just          6            3                     six useless meaningless
     7:             seven useless meaningless words here just to          7            3                   seven useless meaningless
     8:        eigth useless meaningless words here just to fill          8            6   eigth useless meaningless words here just
     9:      nine useless meaningless words here just to fill up          9            7 nine useless meaningless words here just to
    10: ten useless meaningless words here just to fill up space         10            5          ten useless meaningless words here
     2:       words
     3:        here
     4:       words
     5:        here
     6:       words
     7:       words
     8:          to
     9:        fill
    10:        just


                                                      string word.count stop.at.word input output
 1:                                                      one          1            0             
 2:                                                two words          2            1    NA     NA
 3:                                         three words here          3            2    NA     NA
 4:                                  four useless words here          4            2    NA     NA
 5:                      five useless meaningless words here          5            4    NA     NA
 6:                  six useless meaningless words here just          6            3    NA     NA
 7:             seven useless meaningless words here just to          7            3    NA     NA
 8:        eigth useless meaningless words here just to fill          8            6    NA     NA
 9:      nine useless meaningless words here just to fill up          9            7    NA     NA
10: ten useless meaningless words here just to fill up space         10            5  ten      NA



    # Asker's attempt: find the positions of the spaces in each string with
    # gregexpr() and index them by stop.at.word to derive character offsets
    # for the two new columns.
    # NOTE(review): the parentheses in this snippet do not balance, so it looks
    # truncated (presumably an enclosing substr() call was lost) - TODO confirm
    # against the original post.
    # NOTE(review): sapply(..., "[", stop.at.word) hands the whole stop.at.word
    # column to every element as the index, not one value per row.
    texts.dt[, c("input", "output") := .(
               sapply(gregexpr(" ", string),"[", stop.at.word) - 1),
               sapply(gregexpr(" ", string),"[", stop.at.word), 
               sapply(gregexpr(" ", string),"[", stop.at.word + 1) - 1)



r string data.table text-processing


# Split every string into words once, then build both columns from the pieces.
# Rows with stop.at.word == 0 are excluded by the filter in `i`, so their
# input/output stay NA.
texts.dt[stop.at.word > 0, c("input", "output") := {
  sp <- strsplit(string, " ")
  # `:=` with several target columns needs a list with one element per column.
  list(
    # input: the first n words of the row, re-joined with single spaces
    mapply(function(p, n) paste(p[seq_len(n)], collapse = " "), sp, stop.at.word),
    # output: the single word right after position n
    mapply(`[`, sp, stop.at.word + 1L)
  )
}]

# partial result
head(texts.dt, 4)

                    string word.count stop.at.word        input output
1:                     one          1            0           NA     NA
2:               two words          2            1          two  words
3:        three words here          3            2  three words   here
4: four useless words here          4            2 four useless  words


# Regex alternative using stringi (the stringi package must be attached).
# Rows with stop.at.word == 0 are skipped by the filter in `i`.
texts.dt[stop.at.word > 0, c("input", "output") := {
  # One pattern per row (stringi is vectorised over both string and pattern).
  # Group 1 captures the first stop.at.word words; group 3 captures only the
  # single word that follows, because the task asks for exactly one word and
  # not for the whole remainder of the string.
  patt <- paste0("^((\\w+ ){", stop.at.word - 1, "}\\w+) (\\w+)")
  m <- stri_match(string, regex = patt)
  # Column 1 of m is the full match; columns 2 to 4 are the capture groups.
  list(m[, 2], m[, 4])
}]

# Base-R regex alternative. sub() takes a single pattern, so the table is
# grouped by stop.at.word: within each group the repetition count in the
# pattern is one scalar value.
# Operates on texts.dt (the table defined above); a bare `dt` would resolve to
# stats::dt, which cannot be subset.
# For stop.at.word == 0 this yields an empty input and the first word as output.
texts.dt[, `:=`(
  # input: keep only the first stop.at.word words
  input = sub(paste0('((\\s*\\w+){', stop.at.word, '}).*'), '\\1', string),
  # output: skip stop.at.word words, keep the next one
  output = sub(paste0('(\\s*\\w+){', stop.at.word, '}\\s*(\\w+).*'), '\\2', string)
), by = stop.at.word][]
#                                                      string word.count stop.at.word
# 1:                                                      one          1            0
# 2:                                                two words          2            1
# 3:                                         three words here          3            2
# 4:                                  four useless words here          4            2
# 5:                      five useless meaningless words here          5            4
# 6:                  six useless meaningless words here just          6            3
# 7:             seven useless meaningless words here just to          7            3
# 8:        eigth useless meaningless words here just to fill          8            6
# 9:      nine useless meaningless words here just to fill up          9            7
#10: ten useless meaningless words here just to fill up space         10            5
#                                          input output
# 1:                                                one
# 2:                                         two  words
# 3:                                 three words   here
# 4:                                four useless  words
# 5:              five useless meaningless words   here
# 6:                     six useless meaningless  words
# 7:                   seven useless meaningless  words
# 8:   eigth useless meaningless words here just     to
# 9: nine useless meaningless words here just to   fill
#10:          ten useless meaningless words here   just



@Frank 的 mapply 解决方案的一种替代做法是按行分组(by = 1:nrow(texts.dt)),并结合使用 strsplit 和 paste:

# Row-wise alternative: every row is its own group, so `string` and
# `stop.at.word` are scalars inside j.
texts.dt[, c("input", "output") := {
  # Split once and reuse the words for both columns.
  words <- strsplit(string, " ", fixed = TRUE)[[1L]]
  list(
    # seq_len() is empty for stop.at.word == 0 (1:0 would count down to
    # c(1, 0)), so the input for such rows is "".
    paste(words[seq_len(stop.at.word)], collapse = " "),
    # The word right after the input part; NA if there is none.
    words[stop.at.word + 1L]
  )
}, by = seq_len(nrow(texts.dt))]


> texts.dt
                                                      string word.count stop.at.word                                       input output
 1:                                                      one          1            0                                                one
 2:                                                two words          2            1                                         two  words
 3:                                         three words here          3            2                                 three words   here
 4:                                  four useless words here          4            2                                four useless  words
 5:                      five useless meaningless words here          5            4              five useless meaningless words   here
 6:                  six useless meaningless words here just          6            3                     six useless meaningless  words
 7:             seven useless meaningless words here just to          7            3                   seven useless meaningless  words
 8:        eigth useless meaningless words here just to fill          8            6   eigth useless meaningless words here just     to
 9:      nine useless meaningless words here just to fill up          9            7 nine useless meaningless words here just to   fill
10: ten useless meaningless words here just to fill up space         10            5          ten useless meaningless words here   just

除了使用 [[1]],你也可以把 strsplit 包裹在 unlist 中,如下所示:unlist(strsplit(string, ' '))(代替 strsplit(string, ' ')[[1]])。两种写法的结果相同。



# stringi variant of the row-wise approach (the stringi package must be
# attached). Every row is its own group, so `string` and `stop.at.word` are
# scalars inside j.
texts.dt[, c("input", "output") := {
  # Extract the words of this row once and reuse them for both columns.
  words <- stri_extract_all_words(string)[[1L]]
  list(
    # seq_len() is empty for stop.at.word == 0, so the input for such rows
    # is "" and no special-casing of the string is needed.
    paste(words[seq_len(stop.at.word)], collapse = " "),
    # The word right after the input part; NA if there is none.
    words[stop.at.word + 1L]
  )
}, by = seq_len(nrow(texts.dt))]

2)或者采用改编自另一个回答(this answer)的做法:

         # Split each string at the single whitespace that is followed by exactly
         # (word.count - stop.at.word) words up to the end of the string: the left
         # part becomes `input`, the right part `output`.
         # Rows with stop.at.word == 0 are skipped and keep NA in both columns.
         # NOTE(review): the opening of this call and the tail of the pattern were
         # missing from the snippet; they are reconstructed here to match the
         # result printed below - confirm against the original answer.
         texts.dt[stop.at.word > 0,
                  c("input", "output") := tstrsplit(
                    string,
                    # strsplit() recycles `split` along the strings, so every
                    # row gets its own repetition count.
                    split = paste0("(?=(?>\\s+\\S*){",
                                   word.count - stop.at.word,
                                   "}$)\\s"),
                    perl = TRUE)
                  # Keep only the first word of the right-hand part.
                  ][, output := sub('(\\w+).*','\\1',output)]


> texts.dt
                                                      string word.count stop.at.word                                       input output
 1:                                                      one          1            0                                          NA     NA
 2:                                                two words          2            1                                         two  words
 3:                                         three words here          3            2                                 three words   here
 4:                                  four useless words here          4            2                                four useless  words
 5:                      five useless meaningless words here          5            4              five useless meaningless words   here
 6:                  six useless meaningless words here just          6            3                     six useless meaningless  words
 7:             seven useless meaningless words here just to          7            3                   seven useless meaningless  words
 8:        eigth useless meaningless words here just to fill          8            6   eigth useless meaningless words here just     to
 9:      nine useless meaningless words here just to fill up          9            7 nine useless meaningless words here just to   fill
10: ten useless meaningless words here just to fill up space         10            5          ten useless meaningless words here   just
© www.soinside.com 2019 - 2024. All rights reserved.