我想根据 `Structural Notation Type` 列中的值,把 `Structural Notation` 列的值移动到名为 SMILES 或 InChI 的新列中。也就是说,我希望 `Structural Notation` 列中所有 SMILES 表示都位于 SMILES 列中,而所有 InChI 表示都位于 InChI 列中。
为此,我先创建了 Combine 列,把 `Structural Notation Type` 列和 `Structural Notation` 列的内容拼接在一起,并分别用“ && ”(SMILES 行)和“ %% ”(INCHI 行)作为两者之间的分隔符。
library(data.table)
# Example input, as produced by dput(): a 10-row data.table with the columns
#   CAS                        - CAS registry number (character)
#   `Structural Notation Type` - either "SMILES" or "INCHI"
#   `Structural Notation`      - the notation string itself ("" when absent)
#   Combine                    - helper column: type, then " && " (SMILES rows)
#                                or " %% " (INCHI rows), then the notation
structures <- structure(list(CAS = c("70024-85-0", "80934-44-7", "356-86-5",
"70024-86-1", "79004-87-8", "63335-88-6", "70024-79-2", "28994-41-4",
"63021-86-3", "63041-90-7"), `Structural Notation Type` = c("SMILES",
"SMILES", "SMILES", "SMILES", "SMILES", "INCHI", "SMILES", "INCHI",
"INCHI", "INCHI"), `Structural Notation` = c("", "", "O=C(OCC(F)(F)C(F)(F)F)C=C",
"", "", "1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", "",
"1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), Combine = c("SMILES && ", "SMILES && ", "SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C",
"SMILES && ", "SMILES && ", "INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3",
"SMILES && ", "INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
))
打印结果如下:
CAS Structural Notation Type
1: 70024-85-0 SMILES
2: 80934-44-7 SMILES
3: 356-86-5 SMILES
4: 70024-86-1 SMILES
5: 79004-87-8 SMILES
6: 63335-88-6 INCHI
7: 70024-79-2 SMILES
8: 28994-41-4 INCHI
9: 63021-86-3 INCHI
10: 63041-90-7 INCHI
Structural Notation
1:
2:
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4:
5:
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7:
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
Combine
1: SMILES &&
2: SMILES &&
3: SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
4: SMILES &&
5: SMILES &&
6: INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7: SMILES &&
8: INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
在第二步中,我使用了 data.table 中的 tstrsplit 函数来拆分 Combine 列。可以看到最终结果并不正确:例如第 3 行的 SMILES 值本应位于 SMILES 列,实际上却出现在 InChI 列中。
# Attempt to split Combine into the new SMILES / InChI columns by reference.
# NOTE: `split` is handed on to strsplit(), which recycles a length-2 `split`
# along the rows of Combine: odd rows are split on "&&", even rows on "%%".
# A row whose own separator is not the one assigned to it is left unsplit, so
# its second piece becomes NA. Even when a row does split, the first piece
# (the type label) always lands in SMILES and the second (the notation) in
# InChI, regardless of the row's notation type.
structures[, c("SMILES", "InChI") := tstrsplit(Combine, c("&&", "%%"), fixed = TRUE)][]
# dput() of `structures` after the tstrsplit() step: same four columns as the
# input plus the new SMILES and InChI columns. These two columns hold the
# (incorrect) split result: SMILES contains the type label or an unsplit
# Combine value, and InChI contains the notation, " " or NA.
structures <- structure(list(CAS = c("70024-85-0", "80934-44-7", "356-86-5",
"70024-86-1", "79004-87-8", "63335-88-6", "70024-79-2", "28994-41-4",
"63021-86-3", "63041-90-7"), `Structural Notation Type` = c("SMILES",
"SMILES", "SMILES", "SMILES", "SMILES", "INCHI", "SMILES", "INCHI",
"INCHI", "INCHI"), `Structural Notation` = c("", "", "O=C(OCC(F)(F)C(F)(F)F)C=C",
"", "", "1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", "",
"1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), Combine = c("SMILES && ", "SMILES && ", "SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C",
"SMILES && ", "SMILES && ", "INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3",
"SMILES && ", "INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
"INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
), SMILES = c("SMILES ", "SMILES && ", "SMILES ", "SMILES && ",
"SMILES ", "INCHI ", "SMILES ", "INCHI ", "INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H",
"INCHI "), InChI = c(" ", NA, " O=C(OCC(F)(F)C(F)(F)F)C=C", NA,
" ", " 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3", " ",
" 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2",
NA, " 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H"
)), row.names = c(NA, -10L), class = c("data.table", "data.frame"
))
打印结果如下:
CAS Structural Notation Type
1: 70024-85-0 SMILES
2: 80934-44-7 SMILES
3: 356-86-5 SMILES
4: 70024-86-1 SMILES
5: 79004-87-8 SMILES
6: 63335-88-6 INCHI
7: 70024-79-2 SMILES
8: 28994-41-4 INCHI
9: 63021-86-3 INCHI
10: 63041-90-7 INCHI
Structural Notation
1:
2:
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4:
5:
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7:
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
Combine
1: SMILES &&
2: SMILES &&
3: SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
4: SMILES &&
5: SMILES &&
6: INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7: SMILES &&
8: INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
SMILES
1: SMILES
2: SMILES &&
3: SMILES
4: SMILES &&
5: SMILES
6: INCHI
7: SMILES
8: INCHI
9: INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI
InChI
1:
2: <NA>
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4: <NA>
5:
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7:
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: <NA>
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
有没有办法不创建 Combine 列,而是直接把类型为 SMILES 的行的值放入 SMILES 列、把类型为 INCHI 的行的值放入 InChI 列?
如果没有,有没有办法避免使用 tstrsplit 时出现的错误?
@Wimpel,谢谢您的回答。我在下面列出了首选方案和理想方案各自的期望输出。
这是首选方案的期望输出(保留 Combine 列)
CAS Structural Notation Type
1: 70024-85-0 SMILES
2: 80934-44-7 SMILES
3: 356-86-5 SMILES
4: 70024-86-1 SMILES
5: 79004-87-8 SMILES
6: 63335-88-6 INCHI
7: 70024-79-2 SMILES
8: 28994-41-4 INCHI
9: 63021-86-3 INCHI
10: 63041-90-7 INCHI
Structural Notation
1:
2:
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4:
5:
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7:
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
Combine
1: SMILES &&
2: SMILES &&
3: SMILES && O=C(OCC(F)(F)C(F)(F)F)C=C
4: SMILES &&
5: SMILES &&
6: INCHI %% 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7: SMILES &&
8: INCHI %% 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: INCHI %% 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: INCHI %% 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
SMILES
1:
2:
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4:
5:
6:
7:
8:
9:
10:
InChI
1:
2:
3:
4:
5:
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7:
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
这是理想方案的期望输出(无需先创建 Combine 列)
CAS Structural Notation Type
1: 70024-85-0 SMILES
2: 80934-44-7 SMILES
3: 356-86-5 SMILES
4: 70024-86-1 SMILES
5: 79004-87-8 SMILES
6: 63335-88-6 INCHI
7: 70024-79-2 SMILES
8: 28994-41-4 INCHI
9: 63021-86-3 INCHI
10: 63041-90-7 INCHI
Structural Notation
1:
2:
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4:
5:
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7:
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
SMILES
1:
2:
3: O=C(OCC(F)(F)C(F)(F)F)C=C
4:
5:
6:
7:
8:
9:
10:
InChI
1:
2:
3:
4:
5:
6: 1S/C10H22/c1-4-5-6-7-8-9-10(2)3/h10H,4-9H2,1-3H3
7:
8: 1S/C13H12O/c14-13-9-5-4-8-12(13)10-11-6-2-1-3-7-11/h1-9,14H,10H2
9: 1S/C16H9NO2/c18-17(19)14-9-7-12-5-4-10-2-1-3-11-6-8-13(14)16(12)15(10)11/h1-9H
10: 1S/C20H11NO2/c22-21(23)20-16-7-2-1-6-14(16)15-10-8-12-4-3-5-13-9-11-17(20)19(15)18(12)13/h1-11H
看起来转换为宽格式应该可行……不过由于您的示例数据没有给出期望的输出,很难确认这个解决方案是否适合您。
# Reshape long -> wide: one row per CAS and one column per distinct value of
# `Structural Notation Type`, filled with the matching `Structural Notation`.
# Combinations that do not occur in the data are filled with "".
dcast(
  data = structures,
  formula = CAS ~ `Structural Notation Type`,
  value.var = "Structural Notation",
  fill = ""
)