使用带有filter和str_detect函数的关键字列表进行不完整的过滤过程

问题描述 投票:0回答:1

这是我的长格式 df:

Tube_patient      variable   value
1 S_PUC_H_001_(1)_G4 WT_Spike_igg1 1053305
2 S_PUC_H_003_(1)_G4 WT_N_igg1       0
3 S_PUC_H_004_(2)_G4 WT_RBD_igg1       0
4 S_PUC_H_006_(8)_G4 IgG1_WT_Spike       0
5 S_PUC_H_018_(1)_G4 IgG1_WT_N       0
6 S_PUC_H_028_(6)_G4 IgG1_WT_RBD 1961326

下面是一个使用前 5 行和最后 6 行重现问题的数据示例:

> dput(df.infected.melted[c(1:5, 4635:4640), ])
structure(list(Tube_patient = c("S_PUC_H_001_(1)_G4", "S_PUC_H_003_(1)_G4", 
"S_PUC_H_004_(2)_G4", "S_PUC_H_006_(8)_G4", "S_PUC_H_018_(1)_G4", 
"S_PUC_O_007_(6)_G4", "S_PUC_O_025_(1)_G4", "S_PUC_O_038_(1)_G4", 
"S_PUC_O_057_(12)_G4", "S_PUC_O_106_(7)_G4", "S_PUC_O_107_(3)_G4"
), variable = structure(c(1L, 1L, 1L, 1L, 1L, 290L, 290L, 290L, 
290L, 290L, 290L), levels = c("WT_Spike_igg1", "WTS1_igg1", "WT_RBD_igg1", 
"WTS2_igg1", "WT_NTD_igg1", "Alpha_Spike_igg1", "Beta_RBD_igg1", 
"Beta_Spike_igg1", "Delta_Spike_igg1", "Gamma_Spike_igg1", "Alpha_RBD_igg1", 
"Gamma_RBD_igg1", "Ebola_igg1", "Delta_RBD_igg1", "WT_N_igg1", 
"WT_Spike_igg2", "WTS1_igg2", "WT_RBD_igg2", "WTS2_igg2", "WT_NTD_igg2", 
"Alpha_Spike_igg2", "Beta_RBD_igg2", "Beta_Spike_igg2", "Delta_Spike_igg2", 
"Gamma_Spike_igg2", "Alpha_RBD_igg2", "Gamma_RBD_igg2", "Ebola_igg2", 
"Delta_RBD_igg2", "WT_N_igg2", "WT_Spike_igg3", "WTS1_igg3", 
"WT_RBD_igg3", "WTS2_igg3", "WT_NTD_igg3", "Alpha_Spike_igg3", 
"Beta_RBD_igg3", "Beta_Spike_igg3", "Delta_Spike_igg3", "Gamma_Spike_igg3", 
"Alpha_RBD_igg3", "Gamma_RBD_igg3", "Ebola_igg3", "Delta_RBD_igg3", 
"WT_N_igg3", "WT_Spike_igg4", "WTS1_igg4", "WT_RBD_igg4", "WTS2_igg4", 
"WT_NTD_igg4", "Alpha_Spike_igg4", "Beta_RBD_igg4", "Beta_Spike_igg4", 
"Delta_Spike_igg4", "Gamma_Spike_igg4", "Alpha_RBD_igg4", "Gamma_RBD_igg4", 
"Ebola_igg4", "Delta_RBD_igg4", "WT_N_igg4", "WT_Spike_iga1", 
"WTS1_iga1", "WT_RBD_iga1", "WTS2_iga1", "WT_NTD_iga1", "Alpha_Spike_iga1", 
"Beta_RBD_iga1", "Beta_Spike_iga1", "Delta_Spike_iga1", "Gamma_Spike_iga1", 
"Alpha_RBD_iga1", "Gamma_RBD_iga1", "Ebola_iga1", "Delta_RBD_iga1", 
"WT_N_iga1", "WT_Spike_igm", "WTS1_igm", "WT_RBD_igm", "WTS2_igm", 
"WT_NTD_igm", "Alpha_Spike_igm", "Beta_RBD_igm", "Beta_Spike_igm", 
"Delta_Spike_igm", "Gamma_Spike_igm", "Alpha_RBD_igm", "Gamma_RBD_igm", 
"Ebola_igm", "Delta_RBD_igm", "WT_N_igm", "WT_Spike_FcgR2A", 
"WTS1_FcgR2A", "WT_RBD_FcgR2A", "WTS2_FcgR2A", "WT_NTD_FcgR2A", 
"Alpha_Spike_FcgR2A", "Beta_RBD_FcgR2A", "Beta_Spike_FcgR2A", 
"Delta_Spike_FcgR2A", "Gamma_Spike_FcgR2A", "Alpha_RBD_FcgR2A", 
"Gamma_RBD_FcgR2A", "Ebola_FcgR2A", "Delta_RBD_FcgR2A", "WT_N_FcgR2A", 
"WT_Spike_FcgR2B", "WTS1_FcgR2B", "WT_RBD_FcgR2B", "WTS2_FcgR2B", 
"WT_NTD_FcgR2B", "Alpha_Spike_FcgR2B", "Beta_RBD_FcgR2B", "Beta_Spike_FcgR2B", 
"Delta_Spike_FcgR2B", "Gamma_Spike_FcgR2B", "Alpha_RBD_FcgR2B", 
"Gamma_RBD_FcgR2B", "Ebola_FcgR2B", "Delta_RBD_FcgR2B", "WT_N_FcgR2B", 
"WT_Spike_FcgR3A", "WTS1_FcgR3A", "WT_RBD_FcgR3A", "WTS2_FcgR3A", 
"WT_NTD_FcgR3A", "Alpha_Spike_FcgR3A", "Beta_RBD_FcgR3A", "Beta_Spike_FcgR3A", 
"Delta_Spike_FcgR3A", "Gamma_Spike_FcgR3A", "Alpha_RBD_FcgR3A", 
"Gamma_RBD_FcgR3A", "Ebola_FcgR3A", "Delta_RBD_FcgR3A", "WT_N_FcgR3A", 
"WT_Spike_FcgR3B", "WTS1_FcgR3B", "WT_RBD_FcgR3B", "WTS2_FcgR3B", 
"WT_NTD_FcgR3B", "Alpha_Spike_FcgR3B", "Beta_RBD_FcgR3B", "Beta_Spike_FcgR3B", 
"Delta_Spike_FcgR3B", "Gamma_Spike_FcgR3B", "Alpha_RBD_FcgR3B", 
"Gamma_RBD_FcgR3B", "Ebola_FcgR3B", "Delta_RBD_FcgR3B", "WT_N_FcgR3B", 
"IgG1_WT_RBD", "IgG1_WT_Spike", "IgG1_S1", "IgG1_S2", "IgG1_NTD", 
"IgG1_N", "IgG1_Alpha_RBD", "IgG1_Alpha_Spike", "IgG1_Beta_RBD", 
"IgG1_Beta_Spike", "IgG1_Gamma_RBD", "IgG1_Gamma_Spike", "IgG1_Delta_RBD", 
"IgG1_Delta_Spike", "IgG2_WT_RBD", "IgG2_WT_Spike", "IgG2_S1", 
"IgG2_S2", "IgG2_NTD", "IgG2_N", "IgG2_Alpha_RBD", "IgG2_Alpha_Spike", 
"IgG2_Beta_RBD", "IgG2_Beta_Spike", "IgG2_Gamma_RBD", "IgG2_Gamma_Spike", 
"IgG2_Delta_RBD", "IgG2_Delta_Spike", "IgG3_WT_RBD", "IgG3_WT_Spike", 
"IgG3_S1", "IgG3_S2", "IgG3_NTD", "IgG3_N", "IgG3_Alpha_RBD", 
"IgG3_Alpha_Spike", "IgG3_Beta_RBD", "IgG3_Beta_Spike", "IgG3_Gamma_RBD", 
"IgG3_Gamma_Spike", "IgG3_Delta_RBD", "IgG3_Delta_Spike", "IgG4_WT_RBD", 
"IgG4_WT_Spike", "IgG4_S1", "IgG4_S2", "IgG4_NTD", "IgG4_N", 
"IgG4_Alpha_RBD", "IgG4_Alpha_Spike", "IgG4_Beta_RBD", "IgG4_Beta_Spike", 
"IgG4_Gamma_RBD", "IgG4_Gamma_Spike", "IgG4_Delta_RBD", "IgG4_Delta_Spike", 
"IgA_WT_RBD", "IgA_WT_Spike", "IgA_S1", "IgA_S2", "IgA_NTD", 
"IgA_N", "IgA_Alpha_RBD", "IgA_Alpha_Spike", "IgA_Beta_RBD", 
"IgA_Beta_Spike", "IgA_Gamma_RBD", "IgA_Gamma_Spike", "IgA_Delta_RBD", 
"IgA_Delta_Spike", "IgM_WT_RBD", "IgM_WT_Spike", "IgM_S1", "IgM_S2", 
"IgM_NTD", "IgM_N", "IgM_Alpha_RBD", "IgM_Alpha_Spike", "IgM_Beta_RBD", 
"IgM_Beta_Spike", "IgM_Gamma_RBD", "IgM_Gamma_Spike", "IgM_Delta_RBD", 
"IgM_Delta_Spike", "FcgR2A_WT_RBD", "FcgR2A_WT_Spike", "FcgR2A_S1", 
"FcgR2A_S2", "FcgR2A_NTD", "FcgR2A_N", "FcgR2A_Alpha_RBD", "FcgR2A_Alpha_Spike", 
"FcgR2A_Beta_RBD", "FcgR2A_Beta_Spike", "FcgR2A_Gamma_RBD", "FcgR2A_Gamma_Spike", 
"FcgR2A_Delta_RBD", "FcgR2A_Delta_Spike", "FcgR2B_WT_RBD", "FcgR2B_WT_Spike", 
"FcgR2B_S1", "FcgR2B_S2", "FcgR2B_NTD", "FcgR2B_N", "FcgR2B_Alpha_RBD", 
"FcgR2B_Alpha_Spike", "FcgR2B_Beta_RBD", "FcgR2B_Beta_Spike", 
"FcgR2B_Gamma_RBD", "FcgR2B_Gamma_Spike", "FcgR2B_Delta_RBD", 
"FcgR2B_Delta_Spike", "FcgR3A_WT_RBD", "FcgR3A_WT_Spike", "FcgR3A_S1", 
"FcgR3A_S2", "FcgR3A_NTD", "FcgR3A_N", "FcgR3A_Alpha_RBD", "FcgR3A_Alpha_Spike", 
"FcgR3A_Beta_RBD", "FcgR3A_Beta_Spike", "FcgR3A_Gamma_RBD", "FcgR3A_Gamma_Spike", 
"FcgR3A_Delta_RBD", "FcgR3A_Delta_Spike", "FcgR3B_WT_RBD", "FcgR3B_WT_Spike", 
"FcgR3B_S1", "FcgR3B_S2", "FcgR3B_NTD", "FcgR3B_N", "FcgR3B_Alpha_RBD", 
"FcgR3B_Alpha_Spike", "FcgR3B_Beta_RBD", "FcgR3B_Beta_Spike", 
"FcgR3B_Gamma_RBD", "FcgR3B_Gamma_Spike", "FcgR3B_Delta_RBD", 
"FcgR3B_Delta_Spike"), class = "factor"), value = c(1053304.5625, 
0, 0, 0, 0, 254.552083333333, 121296.802083333, 95.5520833333335, 
174257.552083333, 248007, 75025.5), new_old = c("old", "old", 
"old", "old", "old", "new", "new", "new", "new", "new", "new"
)), row.names = c(1L, 2L, 3L, 4L, 5L, 4635L, 4636L, 4637L, 4638L, 
4639L, 4640L), class = "data.frame")

在这里,我尝试执行过滤过程,将这个长 df 拆分成若干小 df,每个小 df 仅包含 `variable` 列中带有相应关键字的行,使用以下代码:

# FILTERING STEP
# Analyte keywords to look for in the `variable` column
keywords_boxplot <- c("Spike", "WTS1", "WTS2", "RBD", "N", "NTD")
# Build one filtered data frame per keyword. Each pattern is the keyword
# immediately followed by an underscore, so a row is kept only when its
# `variable` value contains "<keyword>_".
filtered_dfs <- lapply(
  paste0(keywords_boxplot, "_"),
  function(keyword_regex) {
    filter(df.infected.melted, str_detect(variable, keyword_regex))
  }
)
# Label every list element as "df.<keyword>"
names(filtered_dfs) <- paste0("df.", keywords_boxplot)
# Create each df in the global environment
list2env(filtered_dfs, envir = .GlobalEnv)
The issue here is that in the `variable` column the keywords (Spike, WTS1, and so on) do not appear at a consistent position (I think this could be a potential explanation of why my code is not working properly), for example:

字符串开头的关键字 (WTS1) WTS1_igg2

字符串中间的关键字(SPIKE) Alpha_Spike_FcgR3B

字符串末尾的关键字 (RBD) FcgR2B_Gamma_RBD

So the idea is to use the presence of each keyword in the list to filter the `variable` column:

# Analyte keywords used to filter the `variable` column
keywords_boxplot <- c('Spike', 'WTS1', 'WTS2', 'RBD', 'N', 'NTD')

What can I do to be less strict in the code and detect the keywords of interest? 
At the same time, I need to differentiate between N versus NTD, for example.


For some reason my code does the task for the `Spike` keyword but not for N or NTD, and for WTS1 and WTS2 it only recognizes this type of string:

字符串开头的关键字


Any comment, idea, or correction, is more than welcome!
r
1个回答
0
投票

你的问题在于:关键字的前面或后面并不总是有下划线——有时它们位于字符串的开头或结尾。我们可以在正则表达式中使用 `"(_|^)"` 来匹配下划线或字符串的开头,使用 `"(_|$)"` 来匹配下划线或字符串的结尾。

正如评论中提到的,我还将 `apply` 更改为 `lapply`,这似乎更合适(并且不会导致错误)。

最后一点,我强烈建议您不要把 `list2env` 作为最后一步。如果您将数据保留在 `list` 中,接下来的步骤将会更容易。

library(dplyr)
library(stringr)
# Analyte keywords; each one produces its own filtered data frame
keywords_boxplot <- c("Spike", "WTS1", "WTS2", "RBD", "N", "NTD")

# For every keyword, keep the rows whose `variable` contains it as a whole
# underscore-delimited token: the keyword must be preceded by "_" or the
# start of the string and followed by "_" or the end of the string. This is
# why "N" does not pick up names that only contain "NTD".
# NOTE(review): "WTS1"/"WTS2" only match names spelled that way (e.g.
# "WTS1_igg1"); names such as "IgG1_S1" contain no "WTS1" token -- confirm
# whether those rows are expected in df.WTS1 / df.WTS2 as well.
filtered_dfs <- setNames(
  lapply(keywords_boxplot, function(analyte) {
    token_regex <- paste0("(_|^)", analyte, "(_|$)")
    filter(df.infected.melted, str_detect(variable, token_regex))
  }),
  # Each list element is named "df.<keyword>"
  paste0("df.", keywords_boxplot)
)

# Print the named list of filtered data frames
filtered_dfs
# $df.Spike
#           Tube_patient           variable        value new_old
# 1   S_PUC_H_001_(1)_G4      WT_Spike_igg1 1.053305e+06     old
# 2   S_PUC_H_003_(1)_G4      WT_Spike_igg1 0.000000e+00     old
# 3   S_PUC_H_004_(2)_G4      WT_Spike_igg1 0.000000e+00     old
# 4   S_PUC_H_006_(8)_G4      WT_Spike_igg1 0.000000e+00     old
# 5   S_PUC_H_018_(1)_G4      WT_Spike_igg1 0.000000e+00     old
# 6   S_PUC_O_007_(6)_G4 FcgR3B_Delta_Spike 2.545521e+02     new
# 7   S_PUC_O_025_(1)_G4 FcgR3B_Delta_Spike 1.212968e+05     new
# 8   S_PUC_O_038_(1)_G4 FcgR3B_Delta_Spike 9.555208e+01     new
# 9  S_PUC_O_057_(12)_G4 FcgR3B_Delta_Spike 1.742576e+05     new
# 10  S_PUC_O_106_(7)_G4 FcgR3B_Delta_Spike 2.480070e+05     new
# 11  S_PUC_O_107_(3)_G4 FcgR3B_Delta_Spike 7.502550e+04     new
# 
# $df.WTS1
# [1] Tube_patient variable     value        new_old     
# <0 rows> (or 0-length row.names)
# 
# $df.WTS2
# [1] Tube_patient variable     value        new_old     
# <0 rows> (or 0-length row.names)
# 
# $df.RBD
# [1] Tube_patient variable     value        new_old     
# <0 rows> (or 0-length row.names)
# 
# $df.N
# [1] Tube_patient variable     value        new_old     
# <0 rows> (or 0-length row.names)
# 
# $df.NTD
# [1] Tube_patient variable     value        new_old     
# <0 rows> (or 0-length row.names)
© www.soinside.com 2019 - 2024. All rights reserved.