如何修改分类字符串

问题描述 投票:0回答:1

我正在尝试修改使用 QIIME 2 生成的分类字符串(Taxo_sliced)。对于 taxon 列,我已经能够:1) 删除不需要的 k__、p__、c__ 等前缀标签;2) 将其拆分成多列;3) 将 NA 和“__”替换为“”。现在我希望把所有空白单元格替换为基于前一列分类的名称:例如,如果门(Phylum)列为空,我希望它变成 unclassified_kingdom(Bacteria);如果种(Species)列为空,我希望它变成 unclassified_genus(前一列中的属名)。希望我表达清楚了。下面是我目前使用的代码以及 dput 输出的数据。非常感谢任何帮助。

library(tidyverse)
dput(Taxo_sliced)

# Strip the one-letter rank prefixes ("k__", "p__", ...) from every taxon
# string. Note: `stringsAsFactors` is not a mutate() option; passing it there
# would add a logical column called "stringsAsFactors", so it is omitted.
tax1 <- Taxo_sliced %>%
  mutate(taxon = str_replace_all(taxon, pattern = "\\w__", replacement = ""))

# Split the "; "-delimited string into one column per rank. Strings that stop
# before Species are padded with NA on the right (fill = "right") instead of
# emitting a "missing pieces" warning for each short row.
tax1 <- tax1 %>%
  separate(
    taxon,
    into = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
    sep = "; ",
    fill = "right"
  )

# Normalise every missing or placeholder cell to the empty string.
tax1[is.na(tax1)] <- ""
tax1[tax1 == "__"] <- ""

> dput(Taxo_sliced)
structure(list(otu.id = c("4abaa483334092f021534a979086baeb", 
"ffc36e27c82042664a16bcd4d380b286", "8eb4e34fd58ab95fa2efab34940c01fa", 
"70d55baf78e9ac4d0babeac5dcbae5c2", "c728ad6f5d183cb36fa06b6a3a47758b", 
"6b6d13f1332283e053c18c3b7774b85c", "df3d6113f855e22f2a6d44e60f01baa7", 
"4516aa60a483dd8c7bbc57098c45f1a5", "25af29e1b2d121f8aae468d270d75518", 
"520887a54cc76670a5f7f2e64a8a651b", "22f4ee9a41a4d73580bf7ade8e9e017a", 
"f664a73e7de6e00ff70e013369499cbd", "3f3a0eaeea9c0690b6ede1b17b4fd8ce", 
"b264ac8ff5f9aff74f0b9aa084d9a9f0", "e655845f5f4ce1633524c0c9a0b15927", 
"5859c644498bf396619fcca4ed014b8d", "f684347f6c1a5dcf941c4c17be9dfb8d", 
"e2f6a531bc64c892b56a06d2aed991f7", "ca60f9527111263bca48622331f2b3f6", 
"bbae6ed124f4d6b48435a964a95c8418", "18eb929cf9350e0af65cf863c2786858", 
"af2150fef76b295092cd60a788cbe500", "515ba54f1a6b7609c510b773424470b8", 
"8f6e2a91e20994c00566a5ff2b49506e", "b5051d00e8594db4e68c17a5e131bc31", 
"e249404143b0d737a1eff5b145d87887", "f330a8d2204597dbe97585c4185570bd", 
"dee047362ffa4fcb44fd9c85d7f89694", "8878a0465f5d22d1c56917fe650a7df3", 
"ce0837f50ab0a7abc73bddc82e8b55e6", "6aee73dafee612f6712c469d7d9474f0", 
"fb7a1c0b3625f00ee42dbfcbaa001f12", "719852b30b6cf6748f52cf99407e1e35", 
"6251bd9ebf43fae466939ab366f6e547", "7440aa816171ce9e81d9b67d2dda61c6", 
"8e440529127978044b95fd8316250d04", "47f3d645d96038371757074de1d8fb8d", 
"a1b97046d3d7d7b1e7f0ffdb5c1eb030", "202938f6ff9392b6e167b20b956d9ed1", 
"49af69cbdd5a8fdb1f5153c116782c2f", "c603f40cc192dc15c82385e25b89092a", 
"a5815c42682dc2c92b97590e477e4a53", "51d884cb2660dd8b33068678bad40357", 
"e9398bc0ad9626bc4742c93fd0363bac", "d7828929c9af50b7afc697648d641e4f", 
"7b6d335d29fb8edf989ba7d03a848d84", "a4cd6152db4ba6371bd3d35fcda62a19", 
"0eebf22536faa70d10968eb4972b2736", "16684adccf4a4d35facc56a79870aaa9", 
"bd906cc59babe40542fc681b78f2715f", "d6c964d7a1d76f1547e809eb9b3b8dd6", 
"26672648de2181a2c04ca33e88a1afd1", "4adcf31ec6f620d560355d72574bf86c", 
"e78652a2027cf276c50e6f18ba43a181", "f2cbb29998f80ea0d81f9cde98ee136e", 
"f2cb7b9ac73f1e1a65b2c45a06611f10", "1875d4e7a11ed24c92a865355693b86c", 
"38b7580c13cf081c9f279006c2dc70b9", "222e21f6500d5e0aa66f371387a91193", 
"2f4dc796a1a0c8df6c393cfc2b60ddfc", "2ee657e8d73105fdb87313a047117230", 
"3fc4e1ba3ec8c5224060431bff321530", "224d266b147588a000917723c0268775", 
"41388070b02a7c162aa7245ce4812ebd", "ae2eeeb87431ae4ef2cd38c1865cb52e", 
"98fcc7e4135d41152d2db711de5c1255", "26547c96837ae7efcfa67fca0cc71140", 
"78c5607a490288b6e534cde0066c2961", "c29919216bd6a6713038fd18691cc505", 
"1ef24a460c9407f352a964d2a211914e", "070175299e62656222c1900a4edcc5ee", 
"146cc6a75126c90c5897210af7597f9e", "307c301fcd8c4f7b1bc1d69a06460119", 
"05d429713db038fa66aa06f0a08f84d9", "f3dc239b6d6c09c017eecdf80ee003d6", 
"53f976e60f93b527ffa62fbe48254d53", "771a457379c30fa1d8273ea52d5f53ea", 
"c016d7dfa3b6d4108112ec0a5fc2b5c5", "e62dda278c2f619474817c1bc249bb6f", 
"57fd939c36031a563cb409cba964766d", "63719d66121cd4112b5d8b5d1bacd4b0", 
"409f711b59152d57926cf444c5577087", "aa50bd8494315c2a395044275e4f3609", 
"11902be998c8092e98cbe98af2f648af", "572be56789b840dc21f0dfad380396b3", 
"fb6956ecda12a8dc652b1866dea12301", "5e044f34a0a5ffd168ee1f5855fb99ad", 
"6343c15ef6a0b28bb8d019ebbcd0a55a", "0dcea0611434e0a406939963d1a5ce06", 
"632f712701a6269e41c3ee600d0f3791", "42513a791253a4ede5c83cfd21e37217", 
"b6a1cfc87c3ffa7f96227d4a372d8dd7", "b74ab6944f5a7e98a99e29bcbea455c0", 
"97bca68b08b9ff7077ed2b5e4aa6b360", "697db8c36a87d693ac70c5c1e91d090f", 
"b6635d67cb594473ddba9f8cfba5d13d", "d1264035d98f18b5393cf465af3d602c", 
"9243bc95f3bb4eba8fabc522fa335a7f", "35fd41579b5eec27ec14c400c586c480", 
"7cd4efa29b942711d9a324e442bd9268"), taxon = c("k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Faecalibacterium; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Peptostreptococcaceae; g__Clostridium; s__ruminantium", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Faecalibacterium; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Roseburia; s__inulinivorans", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__finegoldii", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Faecalibacterium; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Clostridium; s__fimetarium", 
"k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Bifidobacteriales; f__Bifidobacteriaceae; g__Bifidobacterium", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Roseburia; s__inulinivorans", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Clostridium; s__fimetarium", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides", 
"k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Oxalobacteraceae", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__putredinis", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Faecalibacterium; s__", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pasteurellales; f__Pasteurellaceae", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Ruminococcus; s__lactaris", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Ruminococcus; s__lactaris", 
"k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Bifidobacteriales; f__Bifidobacteriaceae; g__Bifidobacterium; s__breve", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pasteurellales; f__Pasteurellaceae", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Clostridiaceae; g__Clostridium; s__paraputrificum", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Clostridiaceae; g__Clostridium; s__neonatale", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Succinispira; s__mobilis", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Ruminococcus; s__lactaris", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__copri", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Proteus; s__myxofaciens", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Lactonifactor; s__longoviformis", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Roseburia; s__inulinivorans", 
"k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__agalactiae", 
"k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales", 
"k__Bacteria; p__Firmicutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__Clostridium", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Proteus; s__myxofaciens", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Clostridiaceae; g__Clostridium", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae; g__Clostridium; s__methylpentosum", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Clostridiaceae; g__Clostridium; s__disporicum", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__[Barnesiellaceae]; g__Barnesiella; s__intestinihominis", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae", 
"k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Bifidobacteriales; f__Bifidobacteriaceae; g__Bifidobacterium; s__breve", 
"k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Oxalobacteraceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Clostridium", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Defluviitalea; s__saccharophila", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Rikenellaceae; g__Alistipes; s__onderdonkii", 
"k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Bifidobacteriales; f__Bifidobacteriaceae; g__Bifidobacterium; s__breve", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Selenomonas; s__bovis", 
"k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Bifidobacteriales; f__Bifidobacteriaceae; g__Bifidobacterium; s__bifidum", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Roseburia; s__inulinivorans", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Moellerella; s__wisconsensis", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pasteurellales; f__Pasteurellaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__[Barnesiellaceae]; g__Barnesiella; s__intestinihominis", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae", 
"k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Oxalobacteraceae", 
"k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales", "k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__agalactiae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Dorea; s__longicatena", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Clostridium", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__[Barnesiellaceae]; g__Barnesiella; s__intestinihominis", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__[Barnesiellaceae]; g__Barnesiella; s__intestinihominis", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__eggerthii", 
"k__Bacteria; p__Actinobacteria; c__Coriobacteriia; o__Coriobacteriales; f__Coriobacteriaceae; g__Olsenella; s__uli", 
"k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Ruminococcaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pasteurellales; f__Pasteurellaceae", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__", 
"k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__", 
"k__Bacteria; p__Firmicutes", "k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Clostridium; s__bolteae"
)), class = "data.frame", row.names = c(NA, -100L))
r stringr
1个回答
0
投票

我能想到的唯一方法是使用嵌套 for 循环。这是因为某些行(例如第二行)中有多个 NA。在这种情况下,除非您希望第二个 NA 为“unclassified_genus(NA)”,否则您需要按顺序更新值:

# Taxonomic ranks in hierarchical order; each rank's parent is the entry
# immediately before it.
rank_cols <- c(
  "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"
)

# Strip the rank prefixes and split the string into one column per rank.
# The pattern has three alternatives:
#   "^\\w__"   - the leading prefix (k__), which has no space in front of it
#   " \\w__"   - every later prefix together with the space after the ";"
#   "; \\w__$" - a trailing empty rank such as "; s__", removed entirely
# Rows with fewer than seven ranks are padded with NA on the right.
df <- Taxo_sliced %>%
  mutate(
    taxon = str_replace_all(
      string = taxon,
      pattern = "^\\w__| \\w__|; \\w__$",
      replacement = ""
    )
  ) %>%
  separate_wider_delim(
    taxon,
    names = rank_cols,
    delim = ";",
    too_few = "align_start"
  )

# Fill each missing rank from its parent rank. Columns are visited left to
# right within a row so that a value filled in for one rank is already in
# place when the next rank is processed. Only rank columns below Kingdom are
# touched, so otu.id is never read as a "parent" or overwritten.
for (row in seq_len(nrow(df))) {
  for (col in seq_along(rank_cols)[-1]) {
    current <- df[[rank_cols[col]]][row]
    # An empty string (e.g. from a bare "g__" mid-string) is treated like NA.
    if (is.na(current) || !nzchar(current)) {
      parent <- rank_cols[col - 1]
      df[[rank_cols[col]]][row] <- paste0(
        "unclassified_", tolower(parent), "(", tolower(df[[parent]][row]), ")"
      )
    }
  }
}
© www.soinside.com 2019 - 2024. All rights reserved.