在 R 中按时间戳范围对大型时间序列数据集进行左连接

问题描述 投票:0回答:3

我需要将两个数据集连接在一起:

  1. 每秒采集一次心率测量的时间序列数据(有 140 万条记录)
>allsecsHR
timestamp               HRbpm
2023-03-22 09:04:53     101
2023-03-22 09:04:54     124
2023-03-22 09:04:55     103
2023-03-22 09:04:56     111
2023-03-22 09:04:57     112
2023-03-22 09:04:58     143
2023-03-22 09:04:59     109
2023-03-22 09:05:00     129
2023-03-22 09:05:01     122
2023-03-22 09:05:02     125
2023-03-22 09:05:03     110
  2. 带有开始和结束时间戳的行为记录
>bhr
Behaviour  BhrTimeStart         BhrTimeEnd
Forage     2023-03-22 09:04:53  2023-03-22 09:04:58
Vigilance  2023-03-22 09:04:58  2023-03-22 09:05:03
Forage     2023-03-22 09:05:03  2023-03-22 09:05:10

最后我想要一个数据集,其中每行代表一秒,以及每秒执行的行为

我尝试在 MySQL 中以及在 R 中使用 sqldf 和 powerjoin 执行此操作,但 MySQL 运行 2 小时后总是失去与服务器的连接,而在 R 中要么无法完成并崩溃,要么返回 `Error: vector memory exhausted (limit reached?)`。如果能帮我找到一种有效的方法来做到这一点,我将非常感谢!

我认为 data.table 解决方案可能是最快的,但我不知道该怎么写;tidyverse(tidy)风格的解决方案也很棒!

到目前为止我已经尝试过:

mySQL/sqldf

library(sqldf)

# Range join: attach to every heart-rate second the behaviour whose interval
# contains it. Intervals are treated as half-open [BhrTimeStart, BhrTimeEnd).
# SQL's BETWEEN is inclusive at both ends, so a timestamp that is the end of
# one behaviour and the start of the next (e.g. 09:04:58 in the sample data)
# would match both rows and appear twice in the result.
sqldf("select * from allsecsHR
              left join bhr
              on allsecsHR.timestamp >= bhr.BhrTimeStart
             and allsecsHR.timestamp <  bhr.BhrTimeEnd")

powerjoin

library(powerjoin)

# Left join each heart-rate second to the behaviour whose interval contains
# it. The start bound is inclusive (>=) so that a timestamp equal to
# BhrTimeStart is matched, as in the desired output; the end bound stays
# exclusive so a boundary second belongs to exactly one behaviour.
# A behaviour with a missing BhrTimeEnd is treated as still ongoing.
power_left_join(
  allsecsHR, bhr,
  by = ~ .x$timestamp >= .y$BhrTimeStart &
    (.x$timestamp < .y$BhrTimeEnd | is.na(.y$BhrTimeEnd)),
  keep = "left"
)

我想要的输出表是这样的:

timestamp           HRbpm   Bhr         BhrTimeStart        BhrTimeEnd
2023-03-22 09:04:53 101     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:54 124     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:55 103     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:56 111     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:57 112     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:58 143     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:04:59 109     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:00 129     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:01 122     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:02 125     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:03 110     Forage      2023-03-22 09:05:03   2023-03-22 09:05:10
r timestamp data.table left-join tidyr
3个回答
1
投票

如果 `allsecsHR` 中的所有时间戳在 `bhr` 中都有对应的区间:

library(data.table)

# Make sure both inputs are data.tables (converts data.frames in place).
setDT(allsecsHR)
setDT(bhr)

# Non-equi join: for every heart-rate second, look up the behaviour whose
# half-open interval [BhrTimeStart, BhrTimeEnd) contains it.
# In a non-equi join the join columns of `bhr` take the values of
# `timestamp` in the result, so the original interval bounds are selected
# explicitly through the `x.` prefix. Seconds that fall in no interval are
# kept with NA behaviour, and `allsecsHR` gains no helper column.
bhr[
  allsecsHR,
  .(timestamp, HRbpm, Bhr,
    BhrTimeStart = x.BhrTimeStart, BhrTimeEnd = x.BhrTimeEnd),
  on = .(BhrTimeStart <= timestamp, BhrTimeEnd > timestamp)
]
#>               timestamp HRbpm       Bhr        BhrTimeStart          BhrTimeEnd
#>  1: 2023-03-22 09:04:53   101    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  2: 2023-03-22 09:04:54   124    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  3: 2023-03-22 09:04:55   103    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  4: 2023-03-22 09:04:56   111    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  5: 2023-03-22 09:04:57   112    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  6: 2023-03-22 09:04:58   143 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#>  7: 2023-03-22 09:04:59   109 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#>  8: 2023-03-22 09:05:00   129 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#>  9: 2023-03-22 09:05:01   122 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#> 10: 2023-03-22 09:05:02   125 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#> 11: 2023-03-22 09:05:03   110    Forage 2023-03-22 09:05:03 2023-03-22 09:05:10

数据:

# Sample data (dput-style). Timestamps are POSIXct values stored as seconds
# since the Unix epoch; tzone = "" means they print in the session's local
# time zone. `ts` holds the same values as `timestamp`.
allsecsHR <- structure(
  list(
    timestamp = structure(
      c(1679490293, 1679490294, 1679490295, 1679490296, 1679490297,
        1679490298, 1679490299, 1679490300, 1679490301, 1679490302,
        1679490303),
      class = c("POSIXct", "POSIXt"), tzone = ""
    ),
    HRbpm = c(101L, 124L, 103L, 111L, 112L, 143L,
              109L, 129L, 122L, 125L, 110L),
    ts = structure(
      c(1679490293, 1679490294, 1679490295, 1679490296, 1679490297,
        1679490298, 1679490299, 1679490300, 1679490301, 1679490302,
        1679490303),
      class = c("POSIXct", "POSIXt"), tzone = ""
    )
  ),
  row.names = c(NA, -11L),
  class = c("data.table", "data.frame")
)

# Behaviour intervals: each row is one behaviour with its start and end time.
bhr <- structure(
  list(
    Bhr = c("Forage", "Vigilance", "Forage"),
    BhrTimeStart = structure(
      c(1679490293, 1679490298, 1679490303),
      class = c("POSIXct", "POSIXt"), tzone = ""
    ),
    BhrTimeEnd = structure(
      c(1679490298, 1679490303, 1679490310),
      class = c("POSIXct", "POSIXt"), tzone = ""
    )
  ),
  row.names = c(NA, -3L),
  class = c("data.table", "data.frame")
)


0
投票

可能有一个更好的解决方案,无需扩展,但是像这样的东西会起作用吗?

library(tidyverse)

# Heart-rate samples, one row per second.
df1 <- tibble(
  timestamp = ymd_hms("2023-03-22 09:04:53") + dseconds(0:3),
  HRbpm = c(101, 124, 103, 111)
)
df1

# Behaviour intervals, one row per behaviour bout.
df2 <- tibble(
  Behaviour = c("Forage", "Vigilance", "Forage"),
  BhrTimeStart = ymd_hms(c("2023-03-22 09:04:53",
                           "2023-03-22 09:04:58",
                           "2023-03-22 09:05:03")),
  BhrTimeEnd = ymd_hms(c("2023-03-22 09:04:58",
                         "2023-03-22 09:05:03",
                         "2023-03-22 09:05:10"))
)

# Number of seconds in each half-open interval [BhrTimeStart, BhrTimeEnd).
# The units are fixed explicitly because subtracting date-times picks the
# unit automatically (secs, mins, hours, ...) depending on the span.
# The end second is not counted: it is the start of the next behaviour, and
# counting it twice would duplicate that row in the join below.
df2$size <- as.double(
  difftime(df2$BhrTimeEnd, df2$BhrTimeStart, units = "secs")
)

# Expand every interval into one row per second it covers.
# `origin` is the Unix epoch constant exported by lubridate.
df3 <- tibble(
  behaviour = rep(df2$Behaviour, df2$size),
  timestamp = as.POSIXct(
    sequence(df2$size, from = as.double(df2$BhrTimeStart), by = 1),
    origin = origin,
    tz = "UTC"
  )
)

# Carry the interval bounds along with each expanded second.
df3$BhrTimeStart <- rep(df2$BhrTimeStart, df2$size)
df3$BhrTimeEnd <- rep(df2$BhrTimeEnd, df2$size)

# Plain equality join: each heart-rate second now has at most one match.
df1 %>%
  left_join(df3, by = "timestamp")

您可以将 `df1` 和 `df2` 替换为您自己的数据集。


0
投票

这可能有用:

library(dplyr)
library(lubridate)



# Example heart-rate readings: eleven consecutive seconds plus one reading
# taken months later
allsecsHR <- data.frame(
  timestamp = ymd_hms(c(
    "2023-03-22 09:04:53",
    "2023-03-22 09:04:54",
    "2023-03-22 09:04:55",
    "2023-03-22 09:04:56",
    "2023-03-22 09:04:57",
    "2023-03-22 09:04:58",
    "2023-03-22 09:04:59",
    "2023-03-22 09:05:00",
    "2023-03-22 09:05:01",
    "2023-03-22 09:05:02",
    "2023-03-22 09:05:03",
    "2023-07-22 09:05:03"
  )),
  HRbpm = c(101, 124, 103, 111, 112, 143, 109, 129, 122, 125, 110, 202)
)


# Example behaviour bouts, each with a start and an end time
bhr <- data.frame(
  Behaviour = c("Forage", "Vigilance", "Keks"),
  BhrTimeStart = ymd_hms(c(
    "2023-03-22 09:04:53",
    "2023-03-22 09:04:58",
    "2023-03-22 09:05:03"
  )),
  BhrTimeEnd = ymd_hms(c(
    "2023-03-22 09:04:58",
    "2023-03-22 09:05:03",
    "2023-03-22 09:05:10"
  ))
)


# Look up the behaviour in progress at a single point in time.
#
# Arguments:
#   timestamp  A length-one date-time to classify.
#   intervals  Data frame with columns Behaviour, BhrTimeStart and
#              BhrTimeEnd. Defaults to the `bhr` object visible from the
#              calling environment chain, so existing one-argument calls
#              keep working.
#
# Returns:
#   The Behaviour (as character) of the first row, in row order, whose
#   half-open interval [BhrTimeStart, BhrTimeEnd) contains `timestamp`,
#   or NA_character_ when no interval contains it. The return type is always
#   character, so callers can rely on it.
check_within_interval <- function(timestamp, intervals = bhr) {
  # The comparison below would silently recycle a longer vector.
  stopifnot(length(timestamp) == 1L)

  hits <- which(
    timestamp >= intervals$BhrTimeStart & timestamp < intervals$BhrTimeEnd
  )
  if (length(hits) == 0L) {
    return(NA_character_)
  }
  as.character(intervals$Behaviour[hits[1L]])
}

# Label every heart-rate row with the behaviour found for its timestamp
allsecsHR[["Behaviour"]] <- sapply(
  X = allsecsHR[["timestamp"]],
  FUN = check_within_interval
)

print(allsecsHR)

给我以下输出:

> print(allsecsHR)
             timestamp HRbpm Behaviour
1  2023-03-22 09:04:53   101    Forage
2  2023-03-22 09:04:54   124    Forage
3  2023-03-22 09:04:55   103    Forage
4  2023-03-22 09:04:56   111    Forage
5  2023-03-22 09:04:57   112    Forage
6  2023-03-22 09:04:58   143 Vigilance
7  2023-03-22 09:04:59   109 Vigilance
8  2023-03-22 09:05:00   129 Vigilance
9  2023-03-22 09:05:01   122 Vigilance
10 2023-03-22 09:05:02   125 Vigilance
11 2023-03-22 09:05:03   110      Keks
12 2023-07-22 09:05:03   202      <NA>

请注意,我在 allsecsHR 中添加了一个 bhr 中没有对应区间的日期,以展示对 NA 的处理。Keks 是另一种行为,用于展示区间右端点使用 `<` 比较时的处理。
© www.soinside.com 2019 - 2024. All rights reserved.