根据POSIXct值组合两个数据集(数据集的长度不相等!)

问题描述 投票:0回答:1

我的问题与我之前提过的一个问题几乎相同。

但是,由于较长的第二个数据帧,解决方案似乎无法正常工作。

我想要什么?

1)。我想按以下逻辑向Dataset1添加一个二元列(是/否):如果Dataset2中存在某一行,其ChannelB与Dataset1的Channel相同,并且其时间(Start_Time_comp)落在“before”和“after”时间之间,则为“是”;否则为“否”。

2)。如果二进制列==是,我想从data2添加“ info”列。

关键是,最终数据集的行数不应超过Data1的行数。

> head(data1)
   Channel         before_time          after_time
1:      ve 2019-09-02 20:13:00 2019-09-02 20:43:00
2:      sb 2019-09-02 20:37:00 2019-09-02 21:07:00
3:      ne 2019-09-02 21:12:00 2019-09-02 21:42:00
4:      sb 2019-09-02 21:34:00 2019-09-02 22:04:00
5:      sb 2019-09-02 23:07:00 2019-09-02 23:37:00
6:      sb 2019-09-03 19:59:00 2019-09-03 20:29:00


> head(data2)
   ChannelB     Start_Time_comp info
1:       zi 2019-11-01 02:02:00    1
2:       zi 2019-11-01 02:57:00    2
3:       zi 2019-11-01 03:26:00    3
4:       zs 2019-11-01 04:27:00    4
5:       zi 2019-11-01 04:26:00    5
6:       zi 2019-11-01 04:56:00    6

按照我的尝试,得到的数据帧变得非常大,而我想要的只是Data1加上两列(基于Data2)。

此解决方案是我从上一个问题中得到的:

# Pair every data1 row with all data2 rows on the same channel, then flag the
# pairs whose start time lies inside the [before_time, after_time] window.
# The result holds one row per (data1 row, data2 row) pair.
x <- left_join(data1, data2, by = c("Channel" = "ChannelB")) %>%
  mutate(
    # "No" when there is no start time or it falls outside the window.
    Available = ifelse(
      is.na(Start_Time_comp) |
        Start_Time_comp < before_time |
        Start_Time_comp > after_time,
      "No",
      "Yes"
    ),
    # Keep info only for hits; "x" marks pairs without one.
    info = ifelse(Available != "Yes", "x", as.character(info))
  ) %>%
  select(-Start_Time_comp)

得到的结果如下(信息本身是正确的,但行数大幅增加了!):

 head(x)
  Channel         before_time          after_time info Available
1      ve 2019-09-02 20:13:00 2019-09-02 20:43:00    x        No
2      ve 2019-09-02 20:13:00 2019-09-02 20:43:00    x        No
3      ve 2019-09-02 20:13:00 2019-09-02 20:43:00    x        No
4      ve 2019-09-02 20:13:00 2019-09-02 20:43:00    x        No
5      ve 2019-09-02 20:13:00 2019-09-02 20:43:00    x        No
6      ve 2019-09-02 20:13:00 2019-09-02 20:43:00    x        No

要查看行数:

> dim(data1)
[1] 1982    3
> dim(data2)
[1] 3000    3
> dim(x)
[1] 322912      5    # this should be 1982...

数据1:

> dput(data1[1:100,])
structure(list(Channel = c("ve", "sb", "ne", "sb", "sb", "sb", 
"rt", "ne", "np", "ne", "rt", "sb", "rt", "rt", "sb", "rt", "sb", 
"sb", "sb", "ve", "rt", "rt", "rt", "sb", "np", "sb", "np", "ne", 
"ve", "rt", "ne", "rt", "np", "rt", "rt", "sb", "ve", "rt", "sb", 
"rt", "ne", "ve", "rt", "ne", "sb", "sb", "ve", "sb", "ve", "sb", 
"ve", "np", "rt", "rt", "ne", "sb", "rt", "ve", "rt", "rt", "sb", 
"rt", "np", "sb", "rt", "np", "ve", "ne", "rt", "rt", "ve", "rt", 
"sb", "sb", "rt", "ve", "sb", "rt", "rt", "sb", "rt", "sb", "rt", 
"ve", "sb", "ne", "ve", "ve", "np", "ve", "rt", "rt", "ve", "np", 
"rt", "rt", "rt", "ve", "rt", "ne"), before_time = structure(c(1567455180, 
1567456620, 1567458720, 1567460040, 1567465620, 1567540740, 1567544520, 
1567547760, 1567548540, 1567549920, 1567549980, 1567630740, 1567631100, 
1567634820, 1567635240, 1567636800, 1567638360, 1567715520, 1567720440, 
1567720800, 1567721040, 1567723140, 1567725900, 1567800060, 1567801500, 
1567804980, 1567805340, 1567805580, 1567805760, 1567807440, 1567809240, 
1567809660, 1567810980, 1567813320, 1567885200, 1567893000, 1567894260, 
1567898160, 1567974600, 1567976100, 1567976340, 1567977060, 1567978560, 
1567978560, 1567979520, 1568059080, 1568059380, 1568061480, 1568063340, 
1568066040, 1568145780, 1568146620, 1568152620, 1568154900, 1568231760, 
1568235540, 1568236020, 1568236200, 1568236200, 1568239440, 1568240280, 
1568313120, 1568319420, 1568325540, 1568325840, 1568408820, 1568410200, 
1568411700, 1568412240, 1568414220, 1568491440, 1568492760, 1568494560, 
1568496600, 1568497020, 1568497140, 1568498160, 1568576280, 1568581020, 
1568581020, 1568583420, 1568584980, 1568587140, 1568664780, 1568666280, 
1568666340, 1568669460, 1568750640, 1568751780, 1568752620, 1568754060, 
1568759400, 1568760060, 1568838240, 1568839200, 1568842800, 1568845380, 
1568846640, 1568927160, 1568927220), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), after_time = structure(c(1567456980, 1567458420, 
1567460520, 1567461840, 1567467420, 1567542540, 1567546320, 1567549560, 
1567550340, 1567551720, 1567551780, 1567632540, 1567632900, 1567636620, 
1567637040, 1567638600, 1567640160, 1567717320, 1567722240, 1567722600, 
1567722840, 1567724940, 1567727700, 1567801860, 1567803300, 1567806780, 
1567807140, 1567807380, 1567807560, 1567809240, 1567811040, 1567811460, 
1567812780, 1567815120, 1567887000, 1567894800, 1567896060, 1567899960, 
1567976400, 1567977900, 1567978140, 1567978860, 1567980360, 1567980360, 
1567981320, 1568060880, 1568061180, 1568063280, 1568065140, 1568067840, 
1568147580, 1568148420, 1568154420, 1568156700, 1568233560, 1568237340, 
1568237820, 1568238000, 1568238000, 1568241240, 1568242080, 1568314920, 
1568321220, 1568327340, 1568327640, 1568410620, 1568412000, 1568413500, 
1568414040, 1568416020, 1568493240, 1568494560, 1568496360, 1568498400, 
1568498820, 1568498940, 1568499960, 1568578080, 1568582820, 1568582820, 
1568585220, 1568586780, 1568588940, 1568666580, 1568668080, 1568668140, 
1568671260, 1568752440, 1568753580, 1568754420, 1568755860, 1568761200, 
1568761860, 1568840040, 1568841000, 1568844600, 1568847180, 1568848440, 
1568928960, 1568929020), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, 
100L), class = c("data.table", "data.frame"))

和数据2:

> dput(data2[1:150,])
structure(list(ChannelB = c("zi", "zi", "zi", "zs", "zi", "zi", 
"xi", "xi", "xi", "xi", "eu", "xi", "rt", "xi", "xi", "mt", "id", 
"ge", "xi", "xi", "ge", "np", "zs", "tl", "xi", "fo", "fs", "rt", 
"id", "ge", "xi", "rt", "fo", "rt", "zs", "ge", "xi", "id", "xi", 
"id", "xi", "xi", "tl", "fs", "xi", "fs", "id", "rt", "fo", "co", 
"rt", "rt", "zi", "xi", "rt", "xi", "id", "rt", "zi", "xi", "rt", 
"fs", "rt", "rt", "id", "rt", "zs", "zi", "rt", "fs", "zs", "np", 
"rt", "rt", "rt", "ge", "ge", "rt", "24", "rt", "mt", "tl", "rt", 
"ge", "rt", "id", "rt", "fs", "sp", "fo", "zi", "zs", "ge", "tl", 
"di", "24", "zs", "zi", "tl", "np", "id", "di", "fs", "sp", "fo", 
"fs", "hi", "di", "sp", "bb", "ge", "np", "hi", "fo", "hi", "rt", 
"hi", "zi", "zs", "rt", "hi", "id", "xi", "di", "zi", "id", "fo", 
"xi", "rt", "zi", "hi", "hi", "np", "fs", "rt", "tl", "rt", "eu", 
"24", "tl", "fs", "fs", "xi", "co", "hi", "zs", "zi", "mt", "co", 
"24"), Start_Time_comp = structure(c(1572573720, 1572577020, 
1572578760, 1572582420, 1572582360, 1572584160, 1572588600, 1572589680, 
1572591120, 1572592440, 1572592860, 1572593580, 1572594060, 1572594720, 
1572596100, 1572596280, 1572597060, 1572597000, 1572597120, 1572598260, 
1572598140, 1572599280, 1572599460, 1572599640, 1572599520, 1572599580, 
1572600240, 1572600420, 1572600300, 1572600360, 1572600540, 1572601260, 
1572601320, 1572601260, 1572601260, 1572601560, 1572601740, 1572602640, 
1572602880, 1572603720, 1572603840, 1572604980, 1572605040, 1572605040, 
1572606600, 1572606720, 1572607140, 1572606900, 1572607440, 1572608460, 
1572608580, 1572608580, 1572609000, 1572610440, 1572610860, 1572611400, 
1572611880, 1572612360, 1572613320, 1572615060, 1572615360, 1572616380, 
1572617520, 1572619380, 1572619440, 1572621120, 1572623520, 1572625260, 
1572626160, 1572626760, 1572627000, 1572627900, 1572628380, 1572629760, 
1572630120, 1572630120, 1572630120, 1572630420, 1572630660, 1572630600, 
1572631800, 1572632040, 1572631920, 1572632280, 1572633420, 1572635400, 
1572635340, 1572635400, 1572635940, 1572636120, 1572636720, 1572636540, 
1572636900, 1572637260, 1572637260, 1572637560, 1572638280, 1572638400, 
1572639000, 1572640080, 1572641160, 1572640980, 1572641280, 1572641520, 
1572641640, 1572645300, 1572646200, 1572646320, 1572646500, 1572646620, 
1572646620, 1572647160, 1572647160, 1572646860, 1572648240, 1572648840, 
1572649740, 1572649860, 1572649860, 1572650220, 1572650460, 1572650760, 
1572650880, 1572651540, 1572651660, 1572651960, 1572651780, 1572651960, 
1572652560, 1572753480, 1572762480, 1572763200, 1572765000, 1572765420, 
1572765300, 1572766080, 1572768300, 1572769500, 1572770400, 1572858540, 
1572859020, 1572860700, 1572860940, 1572947520, 1572947880, 1572951420, 
1573037760, 1573038540, 1573038900, 1573038840), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), info = 1:150), row.names = c(NA, 150L
), class = c("data.table", "data.frame"))
r dplyr data.table data-manipulation
1个回答
1
投票

这个?

# Flag each data1 window with "Yes" when at least one data2 row on the same
# channel starts inside it, otherwise "No".
data1 %>%
  # One row per (data1 row, same-channel data2 row) pair; data1 rows whose
  # channel has no match in data2 get NA in Start_Time_comp.
  left_join(data2, by = c("Channel" = "ChannelB")) %>%
  # Inclusive bounds, matching the >= / <= window test used in the question.
  mutate(
    Available = Start_Time_comp >= before_time & Start_Time_comp <= after_time
  ) %>%
  # Collapse the join fan-out back to one row per distinct window.
  group_by(Channel, before_time, after_time) %>%
  # NA comparisons (no start time) are dropped, so they count as "no hit".
  summarise(Available = any(Available, na.rm = TRUE)) %>%
  ungroup() %>%
  mutate(Available = ifelse(Available, "Yes", "No"))
# A tibble: 100 x 4
   Channel before_time         after_time          Available
   <chr>   <dttm>              <dttm>              <chr>    
 1 ne      2019-09-02 21:12:00 2019-09-02 21:42:00 No       
 2 ne      2019-09-03 21:56:00 2019-09-03 22:26:00 No       
 3 ne      2019-09-03 22:32:00 2019-09-03 23:02:00 No       
 4 ne      2019-09-06 21:33:00 2019-09-06 22:03:00 No       
 5 ne      2019-09-06 22:34:00 2019-09-06 23:04:00 No       
 6 ne      2019-09-08 20:59:00 2019-09-08 21:29:00 No       
 7 ne      2019-09-08 21:36:00 2019-09-08 22:06:00 No       
 8 ne      2019-09-11 19:56:00 2019-09-11 20:26:00 No       
 9 ne      2019-09-13 21:55:00 2019-09-13 22:25:00 No       
10 ne      2019-09-16 20:39:00 2019-09-16 21:09:00 No  
© www.soinside.com 2019 - 2024. All rights reserved.