r - data.table - 根据前一列中的行值分割行(2 个模式) - 使用 tstrsplit 的值不正确

问题描述 投票:0回答:1

我尝试根据“Structural Notation Type”(结构符号类型)列中的值,将“Structural Notation”(结构符号)列的值移动到名为 SMILES 或 InChI 的新列中。本质上,我希望结构符号列中的所有 SMILES 符号都位于 SMILES 列中,而所有 InChI 符号都位于 InChI 列中。

为了实现这一目标,我创建了 Combine 列,它把结构符号类型列和结构符号列的内容拼接在一起,并分别用“ && ”(SMILES 行)或“ %% ”(INCHI 行)作为分隔符。


library(data.table)

# Sample data in structure() form, one row per CAS number:
#   - `Structural Notation Type`: "SMILES" or "INCHI"
#   - `Structural Notation`: the notation string (empty for several rows)
#   - Combine: helper column joining type and notation with " && "
#     (SMILES rows) or " %% " (INCHI rows)
structures <- structure(list(CAS = c("70024-85-0", "80934-44-7", "356-86-5",
"70024-86-1", "79004-87-8", "63335-88-6", "70024-79-2", "28994-41-4",
"63021-86-3", "63041-90-7"), `Structural Notation Type` = c("SMILES",
"SMILES", "SMILES", "SMILES", "SMILES", "INCHI", "SMILES", "INCHI",
"INCHI", "INCHI"), `Structural Notation` = c("", "", "O=C(OCC(F)(F)C(F)(F)F)C=C",
"", "", "1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", "",
"1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), Combine = c("SMILES && ", "SMILES && ", "SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C",
"SMILES && ", "SMILES && ", "INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3",
"SMILES && ", "INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
))

渲染

         CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:                                                                                                
 2:                                                                                                
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                                
 5:                                                                                                
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                                
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                                     Combine
 1:                                                                                               SMILES && 
 2:                                                                                               SMILES && 
 3:                                                                      SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                               SMILES && 
 5:                                                                                               SMILES && 
 6:                                                INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                               SMILES && 
 8:                                INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

在第二步中,我使用了 data.table 中的 tstrsplit 函数来拆分 Combine 列。你会发现最终的结果并不正确。例如,第 3 行的 SMILES 符号本应位于 SMILES 列中,但实际上却出现在 InChI 列中。

# Attempt to split `Combine` into two new columns, SMILES and InChI.
# NOTE(review): `split` is a length-2 vector. Judging by the result shown
# below, the two separators are not both tried on every row; they appear to
# be recycled along the rows ("&&" on odd rows, "%%" on even rows). Rows whose
# separator does not match the one assigned to their position stay unsplit
# (rows 2, 4 and 9: whole string in SMILES, NA in InChI). For rows that do
# split, the text after the separator always lands in the second column
# (InChI), whatever the notation type is.
# The trailing [] prints the updated table.
structures[, c("SMILES", "InChI") := tstrsplit(Combine, c("&&", "%%"), fixed = TRUE)][]

# `structures` after the tstrsplit() call above, in structure() form.
# SMILES holds the first piece of each split and InChI the second; rows that
# were not split keep the whole `Combine` string in SMILES and have NA in
# InChI. This is the incorrect result the question is about.
structures <- structure(list(CAS = c("70024-85-0", "80934-44-7", "356-86-5",
"70024-86-1", "79004-87-8", "63335-88-6", "70024-79-2", "28994-41-4",
"63021-86-3", "63041-90-7"), `Structural Notation Type` = c("SMILES",
"SMILES", "SMILES", "SMILES", "SMILES", "INCHI", "SMILES", "INCHI",
"INCHI", "INCHI"), `Structural Notation` = c("", "", "O=C(OCC(F)(F)C(F)(F)F)C=C",
"", "", "1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", "",
"1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), Combine = c("SMILES && ", "SMILES && ", "SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C",
"SMILES && ", "SMILES && ", "INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3",
"SMILES && ", "INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), SMILES = c("SMILES ", "SMILES && ", "SMILES ", "SMILES && ",
"SMILES ", "INCHI ", "SMILES ", "INCHI ", "INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI "), InChI = c(" ", NA, " O=C(OCC(F)(F)C(F)(F)F)C=C", NA,
" ", " 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", " ",
" 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
NA, " 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
))


渲染

          CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:                                                                                                
 2:                                                                                                
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                                
 5:                                                                                                
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                                
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                                     Combine
 1:                                                                                               SMILES && 
 2:                                                                                               SMILES && 
 3:                                                                      SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                               SMILES && 
 5:                                                                                               SMILES && 
 6:                                                INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                               SMILES && 
 8:                                INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                     SMILES
 1:                                                                                 SMILES 
 2:                                                                              SMILES && 
 3:                                                                                 SMILES
 4:                                                                              SMILES &&
 5:                                                                                 SMILES
 6:                                                                                  INCHI
 7:                                                                                 SMILES
 8:                                                                                  INCHI
 9: INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10:                                                                                  INCHI
                                                                                               InChI
 1:
 2:                                                                                             <NA>
 3:                                                                        O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                             <NA>
 5:
 6:                                                 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:
 8:                                 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                                                                                             <NA>
10:  1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

有没有办法不创建 Combine 列,而是直接把类型为 SMILES 的每一行的符号放入 SMILES 列,把类型为 INCHI 的每一行的符号放入 InChI 列?

如果没有,有没有办法避免使用 tstrsplit 时出现的错误?

@Wimpel,谢谢您的回答。我在下面列出了首选解决方案和理想解决方案。

这是首选解决方案

           CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:
 2:
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:
 5:
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
                                                                                                     Combine
 1:                                                                                               SMILES &&
 2:                                                                                               SMILES &&
 3:                                                                      SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
 4:                                                                                               SMILES &&
 5:                                                                                               SMILES &&
 6:                                                INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:                                                                                               SMILES &&
 8:                                INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
     SMILES                                                                                
1: 
2: 
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4: 
5: 
6: 
7: 
8: 
9: 
10: 

                                                                                            
InChI
1: 
2: 
3: 
4: 
5: 
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7: 
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

这是理想的解决方案(首先不需要创建组合列)


           CAS Structural Notation Type
 1: 70024-85-0                   SMILES
 2: 80934-44-7                   SMILES
 3:   356-86-5                   SMILES
 4: 70024-86-1                   SMILES
 5: 79004-87-8                   SMILES
 6: 63335-88-6                    INCHI
 7: 70024-79-2                   SMILES
 8: 28994-41-4                    INCHI
 9: 63021-86-3                    INCHI
10: 63041-90-7                    INCHI
                                                                                Structural Notation
 1:
 2:
 3:                                                                       O=C(OCC(F)(F)C(F)(F)F)C=C
 4:
 5:
 6:                                                1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
 7:
 8:                                1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
 9:                  1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H

     SMILES                                                                                
1: 
2: 
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4: 
5: 
6: 
7: 
8: 
9: 
10: 

                                                                                            
InChI
1: 
2: 
3: 
4: 
5: 
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7: 
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
r data.table strsplit
1个回答
1
投票

看起来转换为宽格式应该可行……不过由于您的示例数据没有给出期望的输出,很难确认此解决方案是否适合您。

# Option 1: reshape to wide format. This returns a NEW table with one row per
# CAS and one column per value of `Structural Notation Type` (so the columns
# are named after the type values, e.g. INCHI and SMILES); missing
# combinations are filled with "". The other columns of `structures` are not
# carried over.
dcast(structures, CAS ~ `Structural Notation Type`, 
      value.var = "Structural Notation", fill = "")

# Option 2: add the SMILES and InChI columns to `structures` itself. This
# keeps every existing column and the original row order, and needs neither
# the Combine helper column nor any string splitting: each notation is copied
# into the column matching its type, and the other column gets "".
structures[, `:=`(
  SMILES = fifelse(`Structural Notation Type` == "SMILES",
                   `Structural Notation`, ""),
  InChI = fifelse(`Structural Notation Type` == "INCHI",
                  `Structural Notation`, "")
)][]
© www.soinside.com 2019 - 2024. All rights reserved.