我正在使用 reshape(基础 R 的函数)将长格式数据表转换为宽格式数据表,但转换之后似乎丢失了一些观测记录。名为 data 的数据表记录了 20 年间 6 种不同鸟类的观测结果。我希望按每个地点/日期组合生成一张宽数据表,并在某次调查中未观测到某个物种时填入 0。
head(data)
  COMMON.NAME    LOCALITY.ID OBSERVATION.DATE OBSERVATION.COUNT
1 Bonellis_Eagle L1210237    12/17/2007       1
2 Boreal_Owl     L11834228   9/3/2020         1
3 Saker_Falcon   L12137171   6/27/2021        1
4 Saker_Falcon   L1218263    4/27/2004        1
5 Brown_Fish_Owl L13864707   2/26/2021        1
6 Bonellis_Eagle L16000115   8/6/2021         2
table(data$COMMON.NAME, data$OBSERVATION.COUNT)
(列为观测计数值,行为物种)
                      1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17 20 21 22 26 27 28 31 35 39 40 41 42 50 51 60 62 64 94 100
Bonellis_Eagle        137 51 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Boreal_Owl            18 2 2 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Brown_Fish_Owl        51 29 11 6 4 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Great_Bustard         38 13 7 9 5 2 2 6 2 5 1 3 0 2 2 1 3 1 1 2 3 1 1 0 1 1 1 1 1 1 1 1 1 1 0
Lanner_Falcon         56 12 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Pin_tailed_Sandgrouse 17 7 2 6 2 4 3 5 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1
Saker_Falcon          61 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Saker_Falcon(猎隼)的总数应为 69 只(计数为 1 的记录 61 条、计数为 2 的记录 2 条、计数为 4 的记录 1 条)。
然后我以 LOCALITY.ID 和 OBSERVATION.DATE 作为 idvar、以 'COMMON.NAME' 作为 timevar 重塑数据表,得到包含所有 6 个物种的宽数据表:
# Reshape the long table to wide format: one row per LOCALITY.ID /
# OBSERVATION.DATE pair and one count column per COMMON.NAME value.
# The argument must be spelled `direction`: R argument names are
# case-sensitive and `Direction` is rejected as an unused argument.
# NOTE: when an id/timevar combination occurs more than once, reshape()
# keeps only the first matching row (and warns), so repeated records of
# the same species at the same location/date are dropped here.
d_sens <- reshape(
  data,
  idvar = c('LOCALITY.ID', 'OBSERVATION.DATE'),
  timevar = 'COMMON.NAME',
  direction = 'wide'
)
最后,我将 NA 替换为 0
# Short labels for the wide table: two id columns followed by one count
# column per species.
# NOTE(review): the names are assigned by position, so the species codes
# must be listed in the same order as the columns produced by reshape()
# (order of first appearance of each COMMON.NAME value) — confirm with
# names(d_sens) before renaming.
species_codes <- c('BE', 'BO', 'SF', 'BFO', 'GB', 'PS', 'LF')
colnames(d_sens) <- c('location', 'date', species_codes)

# A location/date without a record for a species is NA after reshaping;
# treat those cells as a count of 0. Only the species columns are touched.
d_sens[species_codes] <- lapply(
  d_sens[species_codes],
  function(counts) replace(counts, is.na(counts), 0)
)
head(d_sens)
  location  date       BE BO SF BFO GB PS LF
1 L1210237  12/17/2007 1  0  0  0   0  0  0
2 L11834228 9/3/2020   0  1  0  0   0  0  0
3 L12137171 6/27/2021  0  0  1  0   0  0  0
4 L1218263  4/27/2004  0  0  1  0   0  0  0
5 L13864707 2/26/2021  0  0  0  1   0  0  0
6 L16000115 8/6/2021   2  0  0  0   0  0  0
这就是我想要的数据表类型。
table(d_sens$SF)
  0   1   2   4
545  57   2   1
当我比较宽表 d_sens 与长表 data 中 Saker_Falcon 的观测次数时,发现丢失了 4 条计数为 1 的观测记录(61 条变成了 57 条)。我的 data 中没有任何缺失值。任何建议都将不胜感激。
我尝试使用 reshape 命令(基础 R 的函数):
# Long-to-wide reshape keyed on location and date, with one count column
# per species. `direction` must be lower-case: R argument names are
# case-sensitive, so `Direction` fails as an unused argument.
d_sens <- reshape(
  data,
  idvar = c('LOCALITY.ID', 'OBSERVATION.DATE'),
  timevar = 'COMMON.NAME',
  direction = 'wide'
)
d_sens 看起来像我想要的表类型,但缺少值。
我还尝试了 tidyr 的 pivot_wider 函数(之前的帖子中有人建议过),但没能让它正常工作。
# Attempt with tidyr::pivot_wider(): new column names are built from the
# location/date pair and the cells are filled from OBSERVATION.COUNT.
# NOTE(review): for one column per species, names_from would need to be
# COMMON.NAME rather than the location/date pair.
data %>%
  pivot_wider(
    names_from = c(LOCALITY.ID, OBSERVATION.DATE),
    values_from = OBSERVATION.COUNT
  )
但是我收到了如下警告:
Warning message:
Values from `OBSERVATION.COUNT` are not uniquely identified; output will contain list-cols.
这很可能是因为原始数据中存在重复记录。例如,同一地点、同一天、同一物种出现了多条记录:
COMMON.NAME LOCALITY.ID OBSERVATION.DATE OBSERVATION.COUNT
Bonellis_Eagle L1218263 03/01/2023 1
Bonellis_Eagle L1218263 03/01/2023 3
`reshape` 在转换为宽格式时只会保留每组重复记录中的第一条。`pivot_wider` 在调用时同样会给出提示(“Values from `XXX` are not uniquely identified”)。假设 `data` 是数据框的名称,在转换之前,可以使用以下命令把重复记录合并为一条:
# Collapse repeated records of the same species at the same location and
# date into a single row. The counts are summed; n() would only return the
# number of records in each group and discard the observed counts.
data <- data %>%
  group_by(LOCALITY.ID, OBSERVATION.DATE, COMMON.NAME) %>%
  summarise(OBSERVATION.COUNT = sum(OBSERVATION.COUNT), .groups = "drop")
或者,你也可以直接在 `pivot_wider` 中传入参数 `values_fn = sum` 来解决这个问题。
玩具数据集:
# Toy dataset for a reproducible example: a tibble-classed data frame with
# 82 rows and four columns (COMMON.NAME, LOCALITY.ID, OBSERVATION.DATE,
# OBSERVATION.COUNT). OBSERVATION.DATE is stored as character strings and
# OBSERVATION.COUNT as numeric values.
data=structure(list(COMMON.NAME = c("Bonellis_Eagle", "Lanner_Falcon",
"Saker_Falcon", "Great_Bustard", "Brown_Fish_Owl", "Pin_tailed_Sandgrouse",
"Boreal_Owl", "Brown_Fish_Owl", "Lanner_Falcon", "Lanner_Falcon",
"Brown_Fish_Owl", "Great_Bustard", "Saker_Falcon", "Saker_Falcon",
"Brown_Fish_Owl", "Boreal_Owl", "Lanner_Falcon", "Bonellis_Eagle",
"Lanner_Falcon", "Saker_Falcon", "Saker_Falcon", "Brown_Fish_Owl",
"Bonellis_Eagle", "Pin_tailed_Sandgrouse", "Great_Bustard", "Pin_tailed_Sandgrouse",
"Saker_Falcon", "Great_Bustard", "Boreal_Owl", "Brown_Fish_Owl",
"Lanner_Falcon", "Bonellis_Eagle", "Bonellis_Eagle", "Great_Bustard",
"Lanner_Falcon", "Boreal_Owl", "Bonellis_Eagle", "Lanner_Falcon",
"Brown_Fish_Owl", "Boreal_Owl", "Saker_Falcon", "Boreal_Owl",
"Pin_tailed_Sandgrouse", "Brown_Fish_Owl", "Saker_Falcon", "Bonellis_Eagle",
"Pin_tailed_Sandgrouse", "Boreal_Owl", "Lanner_Falcon", "Lanner_Falcon",
"Lanner_Falcon", "Pin_tailed_Sandgrouse", "Pin_tailed_Sandgrouse",
"Boreal_Owl", "Lanner_Falcon", "Bonellis_Eagle", "Bonellis_Eagle",
"Brown_Fish_Owl", "Pin_tailed_Sandgrouse", "Boreal_Owl", "Lanner_Falcon",
"Saker_Falcon", "Pin_tailed_Sandgrouse", "Saker_Falcon", "Great_Bustard",
"Saker_Falcon", "Boreal_Owl", "Saker_Falcon", "Great_Bustard",
"Saker_Falcon", "Brown_Fish_Owl", "Pin_tailed_Sandgrouse", "Boreal_Owl",
"Lanner_Falcon", "Pin_tailed_Sandgrouse", "Bonellis_Eagle", "Boreal_Owl",
"Great_Bustard", "Saker_Falcon", "Bonellis_Eagle", "Pin_tailed_Sandgrouse",
"Pin_tailed_Sandgrouse"), LOCALITY.ID = c("L1218263", "L11834228",
"L12137171", "L1210237", "L16000115", "L13864707", "L1210237",
"L11834228", "L16000115", "L1210237", "L1210237", "L16000115",
"L16000115", "L13864707", "L16000115", "L12137171", "L13864707",
"L16000115", "L1210237", "L1210237", "L1210237", "L13864707",
"L13864707", "L11834228", "L12137171", "L1218263", "L11834228",
"L13864707", "L13864707", "L1210237", "L13864707", "L1210237",
"L12137171", "L1210237", "L1210237", "L1210237", "L16000115",
"L1210237", "L11834228", "L1218263", "L1218263", "L1210237",
"L13864707", "L12137171", "L13864707", "L11834228", "L1218263",
"L1210237", "L16000115", "L1218263", "L13864707", "L1210237",
"L1218263", "L13864707", "L16000115", "L1210237", "L11834228",
"L12137171", "L16000115", "L16000115", "L13864707", "L12137171",
"L12137171", "L13864707", "L13864707", "L16000115", "L11834228",
"L1210237", "L1210237", "L1210237", "L11834228", "L12137171",
"L12137171", "L16000115", "L13864707", "L12137171", "L11834228",
"L1218263", "L11834228", "L12137171", "L11834228", "L11834228"
), OBSERVATION.DATE = c("03/01/2023", "06/01/2023", "05/01/2023",
"06/01/2023", "07/01/2023", "06/01/2023", "06/01/2023", "06/01/2023",
"03/01/2023", "07/01/2023", "01/01/2023", "06/01/2023", "02/01/2023",
"02/01/2023", "03/01/2023", "01/01/2023", "02/01/2023", "05/01/2023",
"06/01/2023", "03/01/2023", "02/01/2023", "07/01/2023", "02/01/2023",
"03/01/2023", "06/01/2023", "01/01/2023", "02/01/2023", "01/01/2023",
"02/01/2023", "06/01/2023", "05/01/2023", "06/01/2023", "03/01/2023",
"03/01/2023", "04/01/2023", "01/01/2023", "04/01/2023", "03/01/2023",
"04/01/2023", "04/01/2023", "07/01/2023", "07/01/2023", "03/01/2023",
"06/01/2023", "01/01/2023", "01/01/2023", "02/01/2023", "05/01/2023",
"06/01/2023", "04/01/2023", "01/01/2023", "01/01/2023", "03/01/2023",
"03/01/2023", "02/01/2023", "02/01/2023", "06/01/2023", "07/01/2023",
"01/01/2023", "04/01/2023", "07/01/2023", "04/01/2023", "05/01/2023",
"04/01/2023", "05/01/2023", "01/01/2023", "04/01/2023", "05/01/2023",
"05/01/2023", "06/01/2023", "02/01/2023", "04/01/2023", "06/01/2023",
"04/01/2023", "02/01/2023", "05/01/2023", "02/01/2023", "01/01/2023",
"05/01/2023", "04/01/2023", "06/01/2023", "01/01/2023"), OBSERVATION.COUNT = c(1,
5, 2, 2, 4, 2, 5, 1, 1, 2, 0, 1, 1, 5, 3, 3, 1, 3, 1, 1, 0, 3,
0, 0, 3, 5, 5, 5, 4, 2, 5, 4, 1, 4, 5, 1, 3, 4, 5, 0, 0, 0, 4,
2, 5, 0, 2, 2, 0, 1, 0, 5, 3, 3, 3, 2, 0, 4, 4, 2, 1, 5, 0, 2,
4, 1, 4, 2, 2, 3, 0, 5, 1, 4, 3, 2, 1, 3, 4, 0, 1, 2)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -82L))
转换为宽格式(同时设置列名、将 NA 替换为 0,并对数据框排序)。
library(dplyr)
library(tidyr)

# Short column code -> species name as it appears in COMMON.NAME.
# pivot_wider() orders the new columns by first appearance of each species
# in the data, so labelling them by position (setNames) attaches the wrong
# code to some species. Matching by name keeps every count under its own
# species, and also fixes the column order of the result.
species_codes <- c(
  BE = "Bonellis_Eagle",
  BO = "Boreal_Owl",
  SF = "Saker_Falcon",
  BFO = "Brown_Fish_Owl",
  GB = "Great_Bustard",
  PS = "Pin_tailed_Sandgrouse",
  LF = "Lanner_Falcon"
)

output <- data %>%
  pivot_wider(
    names_from = COMMON.NAME,
    values_from = OBSERVATION.COUNT,
    values_fn = sum,   # add up repeated records of one species per location/date
    values_fill = 0    # species not recorded at a location/date count as 0
  ) %>%
  # Rename the id columns and select/rename the species columns by name.
  select(
    location = LOCALITY.ID,
    date = OBSERVATION.DATE,
    all_of(species_codes)
  ) %>%
  # date is a character column, so this ordering is lexicographic.
  arrange(location, date)
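输出(注意:下表由按位置调用 setNames 设置列名的版本生成,物种缩写与实际物种并不完全对应;例如 L11834228 / 06/01/2023 一行中 BO 列的 5 实际上是 Lanner_Falcon 的计数):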
# A tibble: 38 × 9
location date BE BO SF BFO GB PS LF
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 L11834228 01/01/2023 0 0 0 0 0 2 0
2 L11834228 02/01/2023 0 0 5 0 0 0 1
3 L11834228 03/01/2023 0 0 0 0 0 0 0
4 L11834228 04/01/2023 0 0 0 0 5 0 4
5 L11834228 05/01/2023 0 0 4 0 0 0 0
6 L11834228 06/01/2023 0 5 0 0 1 1 0
7 L1210237 01/01/2023 0 0 0 0 0 5 1
8 L1210237 02/01/2023 2 0 0 0 0 0 0
9 L1210237 03/01/2023 0 4 1 4 0 0 0
10 L1210237 04/01/2023 0 5 0 0 0 0 0
# ℹ 28 more rows