如何在dplyr::filter中使用子查询。

问题描述 投票:0回答:1

考虑以下的MWE。

# One-time install of the SQLite backend (the package name must be quoted):
# install.packages("RSQLite")
library(DBI)
# Open an in-memory SQLite database for this reproducible example.
con <- dbConnect(RSQLite::SQLite(), ":memory:")

# Example data: 25 rows, six character columns. Every `key` value is the
# concatenation of the date (as ddmmyyyy), status, id and adr of its row.
df <- data.frame(
  # Runs of identical timestamps, in row order.
  date = rep(
    c(
      "2009-03-31 00:00:00", "2014-11-21 00:00:00", "2009-03-31 00:00:00",
      "2014-12-19 00:00:00", "2009-03-31 00:00:00", "2014-12-19 00:00:00"
    ),
    times = c(9L, 4L, 5L, 1L, 3L, 3L)
  ),
  id = c(
    "000000704102", "000000296498", "000000072683", "000000249805",
    "000000222470", "000000148848", "000000703290", "000000109543",
    "000000186798", "000000762610", "000000762298", "000000762609",
    "000000762505", "000000209068", "000000702864", "000000302860",
    "000000305438", "000000244137", "000000776505", "000000226051",
    "000000215632", "000000219317", "000000191577", "000000166322",
    "000000057367"
  ),
  # Only row 5 has status "2"; every other row is "1".
  status = c(rep("1", 4L), "2", rep("1", 20L)),
  historic = c(
    "2010-12-31", "2010-12-31", NA, "2009-06-30", "2011-06-30",
    "2009-12-31", "2009-09-30", NA, "2012-09-30", "2016-04-08",
    "2016-04-08", "2016-04-08", "2017-06-23", "2010-09-30", "2009-09-30",
    "2010-12-31", "2014-06-06", "2010-12-31", "2015-07-17", "2010-12-31",
    "2012-09-30", "2012-03-31", NA, NA, "2016-03-31"
  ),
  adr = c(
    "00301", "00301", "003014", "003014", "00306",
    "0040101", "004025", "004026", "004027", "38255",
    "38255", "38255", "383585", "004032", "004036",
    "004037", "004037", "00404", "44107", "004042",
    "004043", "004043", "453518", "45357", "456012"
  ),
  key = c(
    "31032009100000070410200301", "31032009100000029649800301",
    "310320091000000072683003014", "310320091000000249805003014",
    "31032009200000022247000306", "3103200910000001488480040101",
    "310320091000000703290004025", "310320091000000109543004026",
    "310320091000000186798004027", "21112014100000076261038255",
    "21112014100000076229838255", "21112014100000076260938255",
    "211120141000000762505383585", "310320091000000209068004032",
    "310320091000000702864004036", "310320091000000302860004037",
    "310320091000000305438004037", "31032009100000024413700404",
    "19122014100000077650544107", "310320091000000226051004042",
    "310320091000000215632004043", "310320091000000219317004043",
    "191220141000000191577453518", "19122014100000016632245357",
    "191220141000000057367456012"
  ),
  # Keep every column as character on R versions before 4.0 as well.
  stringsAsFactors = FALSE
)
# Copy the example data frame into the database as table "tbl", replacing any
# existing table of that name. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
dbWriteTable(con, "tbl", df, overwrite = TRUE)


# Return every row with status '1' whose key is selected by the subquery:
# rows dated on or before the cutoff whose `historic` is NULL or later than
# the cutoff. The cutoff is bound once as the named parameter :cutoff instead
# of being pasted into the SQL text twice, and the standard `=` operator is
# used so the statement is not tied to SQLite's `==` extension.
# NOTE(review): the subquery selects `key` while grouping by other columns.
# SQLite tolerates this (it returns one arbitrary key per group), but stricter
# engines such as MS SQL Server reject it -- confirm whether the GROUP BY is
# actually wanted before porting.
dbGetQuery(
  con,
  "
SELECT *
FROM tbl
WHERE status = '1' AND key IN (
  SELECT key
  FROM tbl
  WHERE date <= :cutoff AND (historic IS NULL OR historic > :cutoff)
  GROUP BY id, adr, historic, status
)
",
  params = list(cutoff = "2020-05-08")
)

我的真实数据集约有 70 列、90 万行,存放在 MS SQL Server 上。在真实数据上用 dbGetQuery 运行上述查询需要花费大量时间;把 IN 过滤改写成 LEFT JOIN 也不够高效。我想知道如何用 dplyr 实现这个查询,或者有没有更高效的做法。请注意:date 过滤条件以后会被参数化。

sql r dplyr
1个回答
0
投票

这可能会有帮助。

library(dplyr)

# Cutoff date, kept in one place so it can be parameterized later.
cutoff <- as.Date("2020-05-08")

# Work on the example data frame `df`; a bare `tbl` would resolve to the
# function dplyr::tbl, not to the data. Both date-like columns are stored as
# text, so convert them before comparing against the cutoff.
df_dated <- df %>%
  mutate(across(c(date, historic), as.Date))

# The "subquery": keys of rows dated on or before the cutoff whose `historic`
# is missing or later than the cutoff.
valid_keys <- df_dated %>%
  filter(date <= cutoff, is.na(historic) | historic > cutoff) %>%
  distinct(key)

# Keep status "1" rows whose key appears in the subquery result. semi_join()
# is the filtering-join form of `key %in% ...` and, unlike pull(), does not
# require the keys to be collected into an R vector first.
df_dated %>%
  filter(status == "1") %>%
  semi_join(valid_keys, by = "key")


#                 date           id status historic    adr                         key
#1 2009-03-31 00:00:00 000000072683      1     <NA> 003014 310320091000000072683003014
#2 2009-03-31 00:00:00 000000109543      1     <NA> 004026 310320091000000109543004026
#3 2014-12-19 00:00:00 000000191577      1     <NA> 453518 191220141000000191577453518
#4 2014-12-19 00:00:00 000000166322      1     <NA>  45357 19122014100000016632245357
© www.soinside.com 2019 - 2024. All rights reserved.