在 R 中按时间戳所属的时间区间左连接大型时间序列数据集

Question

我需要将两个数据集连接在一起：

每秒采集一次心率测量的时间序列数据（有 140 万条记录）

>allsecsHR
timestamp               HRbpm
2023-03-22 09:04:53     101
2023-03-22 09:04:54     124
2023-03-22 09:04:55     103
2023-03-22 09:04:56     111
2023-03-22 09:04:57     112
2023-03-22 09:04:58     143
2023-03-22 09:04:59     109
2023-03-22 09:05:00     129
2023-03-22 09:05:01     122
2023-03-22 09:05:02     125
2023-03-22 09:05:03     110

带有开始和结束时间戳的行为记录

>bhr
Behaviour  BhrTimeStart         BhrTimeEnd
Forage     2023-03-22 09:04:53  2023-03-22 09:04:58
Vigilance  2023-03-22 09:04:58  2023-03-22 09:05:03
Forage     2023-03-22 09:05:03  2023-03-22 09:05:10

最后我想要一个数据集，其中每行代表一秒，以及每秒执行的行为

我尝试在 MySQL 中以及在 R 中使用 sqldf 和 powerjoin 执行此操作，但在 MySQL 中运行 2 小时后总是与 SQL 服务器断开连接，而在 R 中则无法完成：要么崩溃，要么返回以下错误：

Error: vector memory exhausted (limit reached?)

如果有人能帮我找到一种高效的实现方法，我将不胜感激！

我认为 data.table 解决方案可能是最快的，但我不知道该如何实现；如果有 tidyverse（tidy）风格的解决方案也很棒！

到目前为止我已经尝试过：

mySQL/sqldf

# Attempt 1: the range join written in SQL and run through sqldf.
# BETWEEN is inclusive at both ends, so a timestamp that is both the end of one
# behaviour and the start of the next (e.g. 09:04:58 in the sample data)
# matches two rows of bhr.
library(sqldf)
sqldf("select * from allsecsHR
              left join bhr
              on allsecsHR.timestamp between bhr.BhrTimeStart and bhr.BhrTimeEnd")

powerjoin

# Attempt 2: the same range join written as a powerjoin formula, where .x
# refers to allsecsHR and .y to bhr.
library(powerjoin)
power_left_join(
  allsecsHR, bhr, 
  # Both comparisons are strict, so a timestamp exactly equal to a BhrTimeStart
  # (e.g. 09:04:53 or 09:04:58 in the sample data) matches no behaviour; a
  # missing BhrTimeEnd is treated as an interval that is still open.
  by = ~.x$timestamp > .y$BhrTimeStart & 
    (.x$timestamp < .y$BhrTimeEnd | is.na(.y$BhrTimeEnd)),
  keep = "left")

我想要的输出表是这样的：

timestamp           HRbpm   Bhr         BhrTimeStart        BhrTimeEnd
2023-03-22 09:04:53 101     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:54 124     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:55 103     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:56 111     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:57 112     Forage      2023-03-22 09:04:53   2023-03-22 09:04:58
2023-03-22 09:04:58 143     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:04:59 109     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:00 129     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:01 122     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:02 125     Vigilance   2023-03-22 09:04:58   2023-03-22 09:05:03
2023-03-22 09:05:03 110     Forage      2023-03-22 09:05:03   2023-03-22 09:05:10

Answer 1

如果

allsecsHR

中的所有时间戳在

bhr

中都有对应的区间：

library(data.table)

# Rolling join: each heart-rate second is matched to the behaviour with the
# latest BhrTimeStart at or before it (roll = TRUE carries that row forward).
# Joining directly on `timestamp` leaves allsecsHR unmodified. In a join result
# the key column of bhr holds the values looked up from allsecsHR, so j uses
# x.BhrTimeStart (the real interval start) and i.timestamp (the heart-rate
# second) to return both.
bhr[allsecsHR, on = .(BhrTimeStart = timestamp), roll = TRUE,
    .(timestamp = i.timestamp, HRbpm, Bhr,
      BhrTimeStart = x.BhrTimeStart, BhrTimeEnd)]
#>               timestamp HRbpm       Bhr        BhrTimeStart          BhrTimeEnd
#>  1: 2023-03-22 09:04:53   101    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  2: 2023-03-22 09:04:54   124    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  3: 2023-03-22 09:04:55   103    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  4: 2023-03-22 09:04:56   111    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  5: 2023-03-22 09:04:57   112    Forage 2023-03-22 09:04:53 2023-03-22 09:04:58
#>  6: 2023-03-22 09:04:58   143 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#>  7: 2023-03-22 09:04:59   109 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#>  8: 2023-03-22 09:05:00   129 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#>  9: 2023-03-22 09:05:01   122 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#> 10: 2023-03-22 09:05:02   125 Vigilance 2023-03-22 09:04:58 2023-03-22 09:05:03
#> 11: 2023-03-22 09:05:03   110    Forage 2023-03-22 09:05:03 2023-03-22 09:05:10

数据：

# Heart-rate readings, one row per second. The object is named allsecsHR to
# match the name used in the join; `ts` is a copy of `timestamp`.
allsecsHR <- structure(list(timestamp = structure(c(1679490293, 1679490294, 
1679490295, 1679490296, 1679490297, 1679490298, 1679490299, 1679490300, 
1679490301, 1679490302, 1679490303), class = c("POSIXct", "POSIXt"
), tzone = ""), HRbpm = c(101L, 124L, 103L, 111L, 112L, 143L, 
109L, 129L, 122L, 125L, 110L), ts = structure(c(1679490293, 1679490294, 
1679490295, 1679490296, 1679490297, 1679490298, 1679490299, 1679490300, 
1679490301, 1679490302, 1679490303), class = c("POSIXct", "POSIXt"
), tzone = "")), row.names = c(NA, -11L), class = c("data.table", 
"data.frame"))

# Behaviour intervals with their start and end timestamps.
bhr <- structure(list(Bhr = c("Forage", "Vigilance", "Forage"), BhrTimeStart = structure(c(1679490293, 
1679490298, 1679490303), class = c("POSIXct", "POSIXt"), tzone = ""), 
    BhrTimeEnd = structure(c(1679490298, 1679490303, 1679490310
    ), class = c("POSIXct", "POSIXt"), tzone = "")), row.names = c(NA, 
-3L), class = c("data.table", "data.frame"))

Answer 2

可能有一个更好的解决方案，无需扩展，但是像这样的东西会起作用吗？

library(tidyverse)

# Toy stand-in for the per-second heart-rate table.
df1 <- tibble(
  timestamp = ymd_hms("2023-03-22 09:04:53") + dseconds(0:3),
  HRbpm = c(101, 124, 103, 111)
)
df1

# Toy stand-in for the behaviour intervals.
df2 <- tibble(
  Behaviour = c("Forage", "Vigilance", "Forage"),
  BhrTimeStart = ymd_hms(c(
    "2023-03-22 09:04:53",
    "2023-03-22 09:04:58",
    "2023-03-22 09:05:03"
  )),
  BhrTimeEnd = ymd_hms(c(
    "2023-03-22 09:04:58",
    "2023-03-22 09:05:03",
    "2023-03-22 09:05:10"
  ))
)

# Number of seconds covered by each behaviour. The units are forced to seconds
# because subtracting POSIXct values picks the unit automatically (it can be
# minutes or hours for longer intervals). Intervals are treated as
# [start, end): the end second belongs to the next behaviour, so a second
# shared by two adjacent intervals is not expanded twice.
df2$size <- as.double(
  difftime(df2$BhrTimeEnd, df2$BhrTimeStart, units = "secs")
)

# Expand every interval into one row per second.
df3 <- tibble(
  behaviour = rep(df2$Behaviour, df2$size),
  # Rebuild the timestamps in UTC so they have the same time zone as the
  # ymd_hms() values they are joined against.
  timestamp = as.POSIXct(
    sequence(df2$size, from = as.double(df2$BhrTimeStart), by = 1),
    origin = "1970-01-01",
    tz = "UTC"
  ),
  BhrTimeStart = rep(df2$BhrTimeStart, df2$size),
  BhrTimeEnd = rep(df2$BhrTimeEnd, df2$size)
)

df1 %>%
  left_join(df3, by = "timestamp")

您可以将

df1

和

df2

替换为您的数据集。

Answer 3

这可能有用：

library(dplyr)
library(lubridate)



# Example heart-rate readings: eleven consecutive seconds on 2023-03-22 plus
# one reading on 2023-07-22 that lies outside every interval in bhr.
allsecsHR <- data.frame(
  timestamp = ymd_hms(c(
    paste("2023-03-22", c(
      "09:04:53", "09:04:54", "09:04:55", "09:04:56", "09:04:57", "09:04:58",
      "09:04:59", "09:05:00", "09:05:01", "09:05:02", "09:05:03"
    )),
    "2023-07-22 09:05:03"
  )),
  HRbpm = c(101, 124, 103, 111, 112, 143, 109, 129, 122, 125, 110, 202)
)


# Example behaviour intervals (start and end of each behaviour).
bhr <- data.frame(
  Behaviour = c("Forage", "Vigilance", "Keks"),
  BhrTimeStart = ymd_hms(
    paste("2023-03-22", c("09:04:53", "09:04:58", "09:05:03"))
  ),
  BhrTimeEnd = ymd_hms(
    paste("2023-03-22", c("09:04:58", "09:05:03", "09:05:10"))
  )
)


# Look up the behaviour whose interval [BhrTimeStart, BhrTimeEnd) contains a
# single timestamp.
#
# timestamp: a single time value comparable with the interval columns.
# intervals: data frame with columns Behaviour, BhrTimeStart and BhrTimeEnd;
#   defaults to `bhr` from the enclosing environment, which is the object the
#   function previously read directly.
# Returns the behaviour of the first matching row as a length-one character
# vector, or NA_character_ when no interval contains `timestamp`.
check_within_interval <- function(timestamp, intervals = bhr) {
  inside <- timestamp >= intervals$BhrTimeStart &
    timestamp < intervals$BhrTimeEnd
  idx <- which(inside)
  if (length(idx) == 0L) {
    # A typed NA keeps the return type character even when nothing matches.
    return(NA_character_)
  }
  # as.character() also covers a Behaviour column stored as a factor.
  as.character(intervals$Behaviour[idx[1L]])
}

# Add a 'Behaviour' column to allsecsHR by looking up every timestamp.
# vapply() fixes the result to a character vector of the same length as the
# input (sapply() changes its return type with the input, e.g. a list for zero
# rows); as.character() turns a bare NA result into a character NA.
allsecsHR$Behaviour <- vapply(
  allsecsHR$timestamp,
  function(ts) as.character(check_within_interval(ts)),
  character(1)
)

print(allsecsHR)

给我以下输出：

> print(allsecsHR)
             timestamp HRbpm Behaviour
1  2023-03-22 09:04:53   101    Forage
2  2023-03-22 09:04:54   124    Forage
3  2023-03-22 09:04:55   103    Forage
4  2023-03-22 09:04:56   111    Forage
5  2023-03-22 09:04:57   112    Forage
6  2023-03-22 09:04:58   143 Vigilance
7  2023-03-22 09:04:59   109 Vigilance
8  2023-03-22 09:05:00   129 Vigilance
9  2023-03-22 09:05:01   122 Vigilance
10 2023-03-22 09:05:02   125 Vigilance
11 2023-03-22 09:05:03   110      Keks
12 2023-07-22 09:05:03   202      <NA>

请注意，我在 allsecsHR 中添加了一个未出现在 bhr 中的日期，以展示对 <NA> 值的处理。Keks 是另一种行为。

在 R 中的时间戳范围内时左连接大型时间序列数据集

问题描述投票：0回答：3

3个回答

最新问题

在 R 中的时间戳范围内时左连接大型时间序列数据集

问题描述 投票：0回答：3

3个回答

最新问题

问题描述投票：0回答：3