考虑以下的MWE。
# One-time setup if the SQLite driver is missing: install.packages("RSQLite")
library(DBI)
# Open a throwaway in-memory SQLite database for the example.
con <- dbConnect(drv = RSQLite::SQLite(), dbname = ":memory:")
# Example data: a 25-row data.frame whose six columns are all character
# vectors (date, id, status, historic, adr, key).
#   date     - timestamp strings of the form "YYYY-MM-DD HH:MM:SS"
#   historic - date strings of the form "YYYY-MM-DD"; NA for some rows
#   key      - appears to be date (as ddmmyyyy), status, id and adr pasted
#              together, e.g. "31032009" + "1" + "000000704102" + "00301"
#              (pattern inferred from the literals below - TODO confirm)
df <- structure(list(date = c("2009-03-31 00:00:00", "2009-03-31 00:00:00",
"2009-03-31 00:00:00", "2009-03-31 00:00:00", "2009-03-31 00:00:00",
"2009-03-31 00:00:00", "2009-03-31 00:00:00", "2009-03-31 00:00:00",
"2009-03-31 00:00:00", "2014-11-21 00:00:00", "2014-11-21 00:00:00",
"2014-11-21 00:00:00", "2014-11-21 00:00:00", "2009-03-31 00:00:00",
"2009-03-31 00:00:00", "2009-03-31 00:00:00", "2009-03-31 00:00:00",
"2009-03-31 00:00:00", "2014-12-19 00:00:00", "2009-03-31 00:00:00",
"2009-03-31 00:00:00", "2009-03-31 00:00:00", "2014-12-19 00:00:00",
"2014-12-19 00:00:00", "2014-12-19 00:00:00"), id = c("000000704102",
"000000296498", "000000072683", "000000249805", "000000222470",
"000000148848", "000000703290", "000000109543", "000000186798",
"000000762610", "000000762298", "000000762609", "000000762505",
"000000209068", "000000702864", "000000302860", "000000305438",
"000000244137", "000000776505", "000000226051", "000000215632",
"000000219317", "000000191577", "000000166322", "000000057367"
), status = c("1", "1", "1", "1", "2", "1", "1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",
"1", "1"), historic = c("2010-12-31", "2010-12-31", NA, "2009-06-30",
"2011-06-30", "2009-12-31", "2009-09-30", NA, "2012-09-30", "2016-04-08",
"2016-04-08", "2016-04-08", "2017-06-23", "2010-09-30", "2009-09-30",
"2010-12-31", "2014-06-06", "2010-12-31", "2015-07-17", "2010-12-31",
"2012-09-30", "2012-03-31", NA, NA, "2016-03-31"), adr = c("00301",
"00301", "003014", "003014", "00306", "0040101", "004025", "004026",
"004027", "38255", "38255", "38255", "383585", "004032", "004036",
"004037", "004037", "00404", "44107", "004042", "004043", "004043",
"453518", "45357", "456012"), key = c("31032009100000070410200301",
"31032009100000029649800301", "310320091000000072683003014",
"310320091000000249805003014", "31032009200000022247000306",
"3103200910000001488480040101", "310320091000000703290004025",
"310320091000000109543004026", "310320091000000186798004027",
"21112014100000076261038255", "21112014100000076229838255", "21112014100000076260938255",
"211120141000000762505383585", "310320091000000209068004032",
"310320091000000702864004036", "310320091000000302860004037",
"310320091000000305438004037", "31032009100000024413700404",
"19122014100000077650544107", "310320091000000226051004042",
"310320091000000215632004043", "310320091000000219317004043",
"191220141000000191577453518", "19122014100000016632245357",
"191220141000000057367456012")), row.names = c(NA, -25L), class = "data.frame")
# Load the example rows into a table named "tbl", replacing any existing
# table of that name. `TRUE` is spelled out because `T` can be reassigned.
dbWriteTable(con, "tbl", df, overwrite = TRUE)

# Cut-off date for the filter. It is bound to both `?` placeholders below
# instead of being pasted into the SQL text, so it can be changed in one
# place without string concatenation.
cutoff <- "2020-05-08"

# Return every status-'1' row whose key is produced by the subquery, i.e.
# rows dated on or before the cut-off whose `historic` value is either
# missing or later than the cut-off. The comparison uses standard `=`
# rather than SQLite's `==` alias.
# NOTE(review): the subquery selects `key` without aggregating it while
# grouping by other columns. SQLite permits this; stricter engines such as
# SQL Server presumably reject it - confirm before porting the query.
dbGetQuery(con, "
SELECT *
FROM tbl
WHERE status = '1' AND key IN (
SELECT key
FROM tbl
WHERE date <= ? AND (historic IS NULL OR historic > ?)
GROUP BY id, adr, historic, status
)
", params = list(cutoff, cutoff))

# Release the connection; nothing after this point uses it.
dbDisconnect(con)
我的真实数据集存放在 MS SQL Server 上,大约有 70 列、90 万行。在真实数据上用 dbGetQuery 运行上述查询需要花费大量时间;把 IN 过滤改写成 LEFT JOIN 也没有明显提速。我希望找到一种使用 dplyr 的写法,或者其他最高效的做法。请注意: date 过滤条件以后会被参数化。
这可能会有帮助。
library(dplyr)

# Cut-off date for the filter, defined once so it is easy to parameterize.
cutoff <- as.Date("2020-05-08")

# Parse the character date columns once so that every comparison below is
# made on Date values. `df` is the example data frame; `tbl` is only the
# name of the SQL table and is not an R object in this script.
parsed <- df %>%
  mutate(across(c(date, historic), as.Date))

# Keys of rows dated on or before the cut-off whose `historic` value is
# either missing or later than the cut-off.
active_keys <- parsed %>%
  filter(date <= cutoff, is.na(historic) | historic > cutoff) %>%
  distinct(key)

# Keep the status-"1" rows whose key is among the active keys. A semi join
# filters by key without adding columns or duplicating rows.
parsed %>%
  filter(status == "1") %>%
  semi_join(active_keys, by = "key")
# date id status historic adr key
#1 2009-03-31 00:00:00 000000072683 1 <NA> 003014 310320091000000072683003014
#2 2009-03-31 00:00:00 000000109543 1 <NA> 004026 310320091000000109543004026
#3 2014-12-19 00:00:00 000000191577 1 <NA> 453518 191220141000000191577453518
#4 2014-12-19 00:00:00 000000166322 1 <NA> 45357 19122014100000016632245357