我有一个两列的数据框,一列(时间)包含图像的时间戳,另一列(状态)包含图像上显示的状态。这些数据框可以跨越几周,并且每天有数百行。连续行中的“状态”可以更改,但不必更改。
示例:
时间 | 状态 |
---|---|
20220526173731 | PORED |
20220526173741 | PORED |
20220526173746 | SJEDI |
20220526175242 | PORED |
20220526175246 | SJEDI |
20220526175806 | SJEDI |
20220526175810 | SJEDI |
20220526175818 | NEMA |
20220526175819 | SJEDI |
20220526175819 | SJEDI |
20220526175822 | SJEDI |
20220526180013 | SJEDI |
20220522071053 | NEMA |
20220522071056 | NEMA |
我想做的是使用 R 计算:整个期间内每个状态的持续时间(以秒为单位)、每天每个状态的持续时间(以秒为单位),以及每个状态在切换到下一个状态之前的持续时间(以秒为单位)。另外,我还想计算整个期间以及每天状态变化的次数。
到目前为止我做了什么:
library(dplyr)
library(data.table)
# Read the raw data and parse the numeric timestamp into POSIXct (CET).
dat <- read.csv("Lala.csv")
dat$Time <- as.POSIXct(
  as.character(dat$Time),
  format = "%Y%m%d%H%M%S",
  tz = "CET"
)
dat$Time
dat$State <- as.factor(dat$State)

# Number every run of consecutive identical states: the counter increases
# each time the integer code of State differs from the previous row.
setDT(dat)
dat[, state_run := cumsum(c(TRUE, diff(as.integer(State)) != 0L))]
head(dat, 20)

# Summarise every run: first timestamp, the state, and the span between the
# first and the last timestamp inside the run.
dat2 <- dat[,
  .(
    StartTime = min(Time),
    State = State[1L],
    Duration = diff(range(Time))
  ),
  by = state_run
]
不幸的是,这并没有给出每个状态的持续时间:它给出的是从每个状态的第一行到该状态最后一行之间的时长,而最后一行并不是该状态的结束时刻。一个状态应当在下一个状态的第一行出现时才算结束。
感谢您的帮助!
这看起来只需在 data.table 中做一次简单的汇总:
1. 将 Time 转换为 POSIXt 对象,这样才能可靠地计算时间差(其中有些差值跨越多天;虽然此处没有出现,但只要有数据跨越不同的月份,直接拿数值形式的时间戳相减就会算错)。
2. 创建一个 Day 变量。
3. 按 Time 排序,这样就可以计算 diff,即每一行与下一行之间相隔的秒数。(最后一行的持续时间未知,需要由您自行决定最后一行应该持续多久。)

library(data.table)
# Convert `quux` to a data.table in place, parse the numeric timestamp into
# POSIXct and derive the calendar day of every observation.
#
# The time zone is given explicitly and reused for `Day`: as.Date() on a
# POSIXct object defaults to tz = "UTC", so without it observations taken
# shortly after local midnight would be assigned to the previous day.
# "CET" is the zone used by the question's own parsing code; adjust it if
# the timestamps were recorded in a different zone.
setDT(quux)
quux[, Time := as.POSIXct(as.character(Time), format = "%Y%m%d%H%M%S", tz = "CET")
  ][, Day := as.Date(Time, tz = "CET")]
# Time State Day
# <POSc> <char> <Date>
# 1: 2022-05-26 17:37:31 PORED 2022-05-26
# 2: 2022-05-26 17:37:41 PORED 2022-05-26
# 3: 2022-05-26 17:37:46 SJEDI 2022-05-26
# 4: 2022-05-26 17:52:42 PORED 2022-05-26
# 5: 2022-05-26 17:52:46 SJEDI 2022-05-26
# 6: 2022-05-26 17:58:06 SJEDI 2022-05-26
# 7: 2022-05-26 17:58:10 SJEDI 2022-05-26
# 8: 2022-05-26 17:58:18 NEMA 2022-05-26
# 9: 2022-05-26 17:58:19 SJEDI 2022-05-26
# 10: 2022-05-26 17:58:19 SJEDI 2022-05-26
# 11: 2022-05-26 17:58:22 SJEDI 2022-05-26
# 12: 2022-05-26 18:00:13 SJEDI 2022-05-26
# 13: 2022-05-22 07:10:53 NEMA 2022-05-22
# 14: 2022-05-22 07:10:56 NEMA 2022-05-22
# Sort chronologically (in place), then store for every row the number of
# seconds until the next row; the last row has no successor and gets NA.
setorderv(quux, cols = "Time")
quux[, diff := {
  gap_secs <- as.numeric(diff(Time), units = "secs")
  c(gap_secs, NA)
}]
# Time State Day diff
# <POSc> <char> <Date> <num>
# 1: 2022-05-22 07:10:53 NEMA 2022-05-22 3
# 2: 2022-05-22 07:10:56 NEMA 2022-05-22 383195
# 3: 2022-05-26 17:37:31 PORED 2022-05-26 10
# 4: 2022-05-26 17:37:41 PORED 2022-05-26 5
# 5: 2022-05-26 17:37:46 SJEDI 2022-05-26 896
# 6: 2022-05-26 17:52:42 PORED 2022-05-26 4
# 7: 2022-05-26 17:52:46 SJEDI 2022-05-26 320
# 8: 2022-05-26 17:58:06 SJEDI 2022-05-26 4
# 9: 2022-05-26 17:58:10 SJEDI 2022-05-26 8
# 10: 2022-05-26 17:58:18 NEMA 2022-05-26 1
# 11: 2022-05-26 17:58:19 SJEDI 2022-05-26 0
# 12: 2022-05-26 17:58:19 SJEDI 2022-05-26 3
# 13: 2022-05-26 17:58:22 SJEDI 2022-05-26 111
# 14: 2022-05-26 18:00:13 SJEDI 2022-05-26 NA
您的总结:
# Sum of the per-row gaps (seconds) for each state over the whole table.
quux[, sum(diff, na.rm = TRUE), by = "State"]
# State V1
# <char> <num>
# 1: NEMA 383199
# 2: PORED 19
# 3: SJEDI 1342
# Sum of the per-row gaps (seconds) for each combination of state and day.
quux[, sum(diff, na.rm = TRUE), by = c("State", "Day")]
# State Day V1
# <char> <Date> <num>
# 1: NEMA 2022-05-22 383198
# 2: PORED 2022-05-26 19
# 3: SJEDI 2022-05-26 1342
# 4: NEMA 2022-05-26 1
# Sum of the per-row gaps (seconds) for each uninterrupted run of a state;
# rleid() starts a new run id whenever State differs from the previous row.
quux[, sum(diff, na.rm = TRUE), by = list(rleid = rleid(State), State)]
# rleid State V1
# <int> <char> <num>
# 1: 1 NEMA 383198
# 2: 2 PORED 15
# 3: 3 SJEDI 896
# 4: 4 PORED 4
# 5: 5 SJEDI 332
# 6: 6 NEMA 1
# 7: 7 SJEDI 114
# Number of state changes within each day: compare every row of the day
# with the row before it and count the mismatches.
quux[, sum(tail(State, -1L) != head(State, -1L)), by = Day]
# Day V1
# <Date> <int>
# 1: 2022-05-22 0
# 2: 2022-05-26 5
起始数据:
# Sample data: numeric timestamps (YYYYMMDDhhmmss) and the state observed at
# each of them, deliberately not in chronological order.
quux <- data.frame(
  Time = c(
    20220526173731, 20220526173741, 20220526173746, 20220526175242,
    20220526175246, 20220526175806, 20220526175810, 20220526175818,
    20220526175819, 20220526175819, 20220526175822, 20220526180013,
    20220522071053, 20220522071056
  ),
  State = c(
    "PORED", "PORED", "SJEDI", "PORED", "SJEDI", "SJEDI", "SJEDI",
    "NEMA", "SJEDI", "SJEDI", "SJEDI", "SJEDI", "NEMA", "NEMA"
  ),
  stringsAsFactors = FALSE
)