在R中使用melt()函数整理并映射数据框,以便绘制折线图

问题描述 投票:0回答:1

我有一个数据框,包含4个时间点(行)和18个基因(列),其中还有一行是与每个基因相关联的ID,这有助于在ggplot2中绘制折线图。我导入数据后,使用melt()函数将数据从宽格式转换为长格式。我发现转换之后,ID被夹在了数据框的中间。我希望把它们单独放到最后一列,这样在使用ggplot2绘制折线图时会很有用。请帮帮我。

谢谢,

Toufiq

数据导入

# Load the raw table; strings are kept as plain character vectors (no factors).
B1_Test <- read.csv("./B1_Test.csv", stringsAsFactors = FALSE)

# Emit a reproducible text representation of the first rows.
dput(head(B1_Test))
structure(list(Timepoints = c("1", "2", "3", "5", "ID"), Gene_A = c("-2.05066", 
"-0.657222", "-1.49477", "-1.80191", "A1.1"), Gene_B = c("-8.35787", 
"-9.52402", "-10.6604", "-10.516", "A1.2"), Gene_C = c("-2.06287", 
"-0.846725", "-1.63796", "-1.31922", "A1.3"), Gene_D = c("-3.83545", 
"-1.19723", "-1.53115", "-3.25903", "A1.4"), Gene_E = c("-6.59039", 
"-5.98822", "-6.23785", "-5.00584", "A1.5"), Gene_F = c("-5.02469", 
"-4.41637", "-5.46219", "-3.97594", "A1.1"), Gene_G = c("-7.75424", 
"-8.17158", "-7.90569", "-8.01352", "A1.6"), Gene_H = c("-4.65703", 
"-3.42328", "-4.08867", "-3.76642", "A1.2"), Gene_I = c("-11.7749", 
"-11.649", "-11.3751", "-10.3728", "A1.3"), Gene_K = c("-4.08981", 
"-3.09873", "-3.95986", "-3.97249", "A1.4"), Gene_L = c(NA, "-19.7923", 
NA, "-15.1216", "A1.5"), Gene_M = c("-4.11469", "-3.19647", "-3.99615", 
"-3.06183", "A1.6"), Gene_N = c("-6.53017", "-6.16685", "-6.865", 
"-6.44303", "A1.9"), Gene_O = c("-4.58034", "-3.45153", "-4.86697", 
"-5.25414", "A2.2"), Gene_P = c("-3.45614", "-2.72413", "-2.75492", 
"-2.76479", "A2.2"), Gene_R = c("-5.24809", "-4.15782", "-5.28192", 
"-5.72024", "A2.6"), Gene_S = c("-7.73098", "-7.20226", "-8.04388", 
"-7.68191", "A2.6"), Gene_T = c("-5.09079", "-4.52039", "-4.75427", 
"-5.4321", "A1.9")), row.names = c(NA, 5L), class = "data.frame")

使用melt()将数据从宽格式转换为长格式

# Attach reshape2 with library() so that a missing package fails immediately
# here, rather than require() returning FALSE and melt() failing later.
library(reshape2)

# Wide -> long: keep Timepoints as the identifier, stack every gene column
# into a (Genes, value) pair.
B1_Test_melt <- melt(
  B1_Test,
  id.vars = "Timepoints",
  variable.name = "Genes"
)

dput(B1_Test_melt)
structure(list(Timepoints = c("1", "2", "3", "5", "ID", "1", 
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", 
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", 
"1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", 
"3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", 
"ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", 
"2", "3", "5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", 
"5", "ID", "1", "2", "3", "5", "ID", "1", "2", "3", "5", "ID"
), Genes = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 
9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 11L, 
12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 
14L, 14L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 17L, 
17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L), .Label = c("Gene_A", 
"Gene_B", "Gene_C", "Gene_D", "Gene_E", "Gene_F", "Gene_G", "Gene_H", 
"Gene_I", "Gene_K", "Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P", 
"Gene_R", "Gene_S", "Gene_T"), class = "factor"), value = c("-2.05066", 
"-0.657222", "-1.49477", "-1.80191", "A1.1", "-8.35787", "-9.52402", 
"-10.6604", "-10.516", "A1.2", "-2.06287", "-0.846725", "-1.63796", 
"-1.31922", "A1.3", "-3.83545", "-1.19723", "-1.53115", "-3.25903", 
"A1.4", "-6.59039", "-5.98822", "-6.23785", "-5.00584", "A1.5", 
"-5.02469", "-4.41637", "-5.46219", "-3.97594", "A1.1", "-7.75424", 
"-8.17158", "-7.90569", "-8.01352", "A1.6", "-4.65703", "-3.42328", 
"-4.08867", "-3.76642", "A1.2", "-11.7749", "-11.649", "-11.3751", 
"-10.3728", "A1.3", "-4.08981", "-3.09873", "-3.95986", "-3.97249", 
"A1.4", NA, "-19.7923", NA, "-15.1216", "A1.5", "-4.11469", "-3.19647", 
"-3.99615", "-3.06183", "A1.6", "-6.53017", "-6.16685", "-6.865", 
"-6.44303", "A1.9", "-4.58034", "-3.45153", "-4.86697", "-5.25414", 
"A2.2", "-3.45614", "-2.72413", "-2.75492", "-2.76479", "A2.2", 
"-5.24809", "-4.15782", "-5.28192", "-5.72024", "A2.6", "-7.73098", 
"-7.20226", "-8.04388", "-7.68191", "A2.6", "-5.09079", "-4.52039", 
"-4.75427", "-5.4321", "A1.9")), row.names = c(NA, -90L), class = "data.frame")

预期输出

dput((B1_Test_v1))
structure(list(Timepoints = c(1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 
1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L, 1L, 2L, 3L, 5L
), Genes = c("Gene_A", "Gene_A", "Gene_A", "Gene_A", "Gene_B", 
"Gene_B", "Gene_B", "Gene_B", "Gene_C", "Gene_C", "Gene_C", "Gene_C", 
"Gene_D", "Gene_D", "Gene_D", "Gene_D", "Gene_E", "Gene_E", "Gene_E", 
"Gene_E", "Gene_F", "Gene_F", "Gene_F", "Gene_F"), value = c(-2.05066, 
-0.657222, -1.49477, -1.80191, -8.35787, -9.52402, -10.6604, 
-10.516, -2.06287, -0.846725, -1.63796, -1.31922, -3.83545, -1.19723, 
-1.53115, -3.25903, -6.59039, -5.98822, -6.23785, -5.00584, -5.02469, 
-4.41637, -5.46219, -3.97594), ID = c("A1.1", "A1.1", "A1.1", 
"A1.1", "A1.2", "A1.2", "A1.2", "A1.2", "A1.3", "A1.3", "A1.3", 
"A1.3", "A1.4", "A1.4", "A1.4", "A1.4", "A1.5", "A1.5", "A1.5", 
"A1.5", "A1.1", "A1.1", "A1.1", "A1.1")), class = "data.frame", row.names = c(NA, 
-24L))
r dataframe ggplot2 melt
1个回答
0
投票

该数据在源文件中的格式就不正确:数据中包含的ID行把所有数值列都变成了字符串。您首先应该说服提供该数据文件的人给出一个合理的数据集(我认为混合类型的列并不合理)。

如果做不到这一点,可以先取出ID行并单独对其进行整形,然后再将其合并回经过整形的其余数据中。

# Build a Genes -> ID lookup table from the single "ID" row:
#   1. isolate the row whose Timepoints entry is "ID",
#   2. melt it to long format, storing the cell contents in an "ID" column,
#   3. keep only the two columns needed for the later merge.
B1_IDs <- B1_Test[B1_Test$Timepoints == "ID", ]
B1_IDs <- melt(
  B1_IDs,
  id.vars = "Timepoints",
  variable.name = "Genes",
  value.name = "ID"
)
B1_IDs <- B1_IDs[, c("Genes", "ID")]
head(B1_IDs)
#    Genes   ID
# 1 Gene_A A1.1
# 2 Gene_B A1.2
# 3 Gene_C A1.3
# 4 Gene_D A1.4
# 5 Gene_E A1.5
# 6 Gene_F A1.1

现在对非ID行进行重塑:

# Reshape the measurement rows (everything except the "ID" row) to long format:
# one row per Timepoints/Genes pair, with the reading stored in `value`.
B1_Test_melt <- B1_Test[B1_Test$Timepoints != "ID", ]
B1_Test_melt <- melt(
  B1_Test_melt,
  id.vars = "Timepoints",
  variable.name = "Genes"
)
B1_Test_melt
# *** output flushed ***
head(B1_Test_melt)
#   Timepoints  Genes     value
# 1          1 Gene_A  -2.05066
# 2          2 Gene_A -0.657222
# 3          3 Gene_A  -1.49477
# 4          5 Gene_A  -1.80191
# 5          1 Gene_B  -8.35787
# 6          2 Gene_B  -9.52402

并将两者合并:

# Attach each gene's ID to its measurements by joining on the shared "Genes"
# column; all = TRUE keeps rows from either side even when there is no match.
B1_merged <- merge(
  x = B1_Test_melt,
  y = B1_IDs,
  by = "Genes",
  all = TRUE
)
head(B1_merged)
#    Genes Timepoints     value   ID
# 1 Gene_A          1  -2.05066 A1.1
# 2 Gene_A          2 -0.657222 A1.1
# 3 Gene_A          3  -1.49477 A1.1
# 4 Gene_A          5  -1.80191 A1.1
# 5 Gene_B          1  -8.35787 A1.2
# 6 Gene_B          2  -9.52402 A1.2

(除非我遗漏了什么,您可能还需要执行 B1_merged$value <- as.numeric(B1_merged$value)。另外请注意,Genes 是因子(factor),如有需要可以用 as.character 进行转换。)


0
投票

要做的第一件事是将ID与数据分开:

# Take row 5 (the "ID" row) without the Timepoints column and transpose it, so
# every gene becomes a row name with its ID in a single column.
Gene_ID <- t(B1_Test[5, -1])
Gene_ID <- data.frame(Gene_ID)
> Gene_ID
         X5
Gene_A A1.1
Gene_B A1.2
Gene_C A1.3
Gene_D A1.4
Gene_E A1.5
snip....

然后融化非ID行:

> Gene_vals <- melt( B1_Test[-5,],  id.vars = 'Timepoints', variable.name = 'Genes')
> head(Gene_vals)
  Timepoints  Genes     value
1          1 Gene_A  -2.05066
2          2 Gene_A -0.657222
3          3 Gene_A  -1.49477
4          5 Gene_A  -1.80191
5          1 Gene_B  -8.35787
6          2 Gene_B  -9.52402
> str(Gene_vals)
'data.frame':   72 obs. of  3 variables:
 $ Timepoints: chr  "1" "2" "3" "5" ...
 $ Genes     : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ value     : chr  "-2.05066" "-0.657222" "-1.49477" "-1.80191" ...
> Gene_vals$value <- as.numeric(Gene_vals$value)
> str(Gene_vals)
'data.frame':   72 obs. of  3 variables:
 $ Timepoints: chr  "1" "2" "3" "5" ...
 $ Genes     : Factor w/ 18 levels "Gene_A","Gene_B",..: 1 1 1 1 2 2 2 2 3 3 ...
 $ value     : num  -2.051 -0.657 -1.495 -1.802 -8.358 ...

并合并它们:

> final <- merge(Gene_vals, Gene_ID, by.x="Genes", by.y="row.names")
> head(final)
   Genes Timepoints     value   X5
1 Gene_A          1 -2.050660 A1.1
2 Gene_A          2 -0.657222 A1.1
3 Gene_A          3 -1.494770 A1.1
4 Gene_A          5 -1.801910 A1.1
5 Gene_B          1 -8.357870 A1.2
6 Gene_B          2 -9.524020 A1.2
© www.soinside.com 2019 - 2024. All rights reserved.