R:如何将用户定义的函数应用于数据表的多列并在滑动窗口中求值,即移动平均值

问题描述 投票:0回答:1

我希望有一个滑动窗口,可以在大约130列的整个数据表上计算多个量(最小,最大,总计,平均值)。窗口在第一个键设置的数据组上滑动。以下代码适用于一列,但我不知道如何修改多列的代码。

# Rolling sums
# ref http://stackoverflow.com/questions/23501262/r-using-data-table-to-efficiently-test-rolling-conditions-across-multiple-rows
#
library(data.table)
#
# Generate sample data
#
set.seed(57)
min_nSamples=5
max_nSamples=20
repN<-function(un) {
  rep(un,sample(min_nSamples:max_nSamples,1))
}
nUnits=1
units<- unlist(lapply(1:nUnits,repN))
dt<-data.table(unit = units)
dt <- dt[,{
  ## if there's just one row in the group of ID's, return nothing
  list(
    index = as.double(seq(1,.N))
  )}, by = c("unit")]
dt[, var1:=runif(nrow(dt), -5, 5)]
if (FALSE) {
  dt[, var2:=runif(nrow(dt), 5, 10)]
  dt[, var3:=runif(nrow(dt), 10, 15)]
}
setkeyv(dt,c("unit","index"))


# Reference
# http://stackoverflow.com/questions/14937165/using-dynamic-column-names-in-data-table

# Get a complete map of units / indices
keys = key(dt)
key_dt = CJ(event=unique(dt$unit), index=min(dt$index):max(dt$index))

key_map_values = dt[key_dt]

window_size = 5L
window_dt = key_map_values[, list(window_index = seq(index-window_size, index-1L, by=1L)), by=keys]

setkeyv(window_dt, c(keys[1], "window_index")) ## note the join here is on "event, window"
setkeyv(key_map_values, keys)

ans = key_map_values[window_dt]

q_mov_avg = ans[, mean(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_tot = ans[, sum(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_max = ans[, max(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]
q_mov_min = ans[, min(var1, na.rm=TRUE) * (!any(index < 1) | NA), by="unit,i.index"]

q_all_step_1 <- merge(q_mov_avg,q_mov_max,by=c("unit","i.index"))
setnames(q_all_step_1,"V1.x","avg_var1")
setnames(q_all_step_1,"V1.y","max_var1")
q_all_step_2 <- merge(q_all_step_1,q_mov_tot,by=c("unit","i.index"))
setnames(q_all_step_2,"V1","tot_var1")
q_all <- merge(q_all_step_2,q_mov_min,by=c("unit","i.index"))
setnames(q_all,"V1","min_var1")
setnames(q_all,"i.index","index")
setkey(q_all,unit,index)
dt$src="orig"
q_all$src="window"

dt_melt <- melt(dt,id=c(1:2,4),measure=3)
q_all_melt <- melt(q_all,id=c(1:2,7),measure=3:6)
dt_q_melt<-rbind(dt_melt,q_all_melt)
dt_q_melt[src=="window"]$index <- dt_q_melt[src=="window"]$index - window_size/2
ggplot(data=dt_q_melt[unit==1,],aes(x=index,y=value,group=variable,color=variable)) +
  geom_line() +
  geom_point()

当数据样本有更多列时,如何将代码修改为在上面将FALSE设置为TRUE时运行?

r data.table multiple-columns sliding-window
1个回答
0
投票

以下代码使用滑动窗口中的行评估数据表多列上的用户定义函数。有两个键,第一个键用于对数据进行分组,第二个键是滑动窗口的方向。移动窗口的结果与数据一起绘制。

#
# Generate sample data
#
library(data.table)
library(ggplot2)
library(reshape2)
fakeData<-function() {

  set.seed(103)
  # Little dataset
  min_nSamples=30
  max_nSamples=50
  nVar = 10
  nUnits=30
  if(FALSE) {
    # Big dataset
    min_nSamples=80
    max_nSamples=120
    nVar = 130
    nUnits=330
  } 

  repN<-function(un) {
    rep(un,sample(min_nSamples:max_nSamples,1))
  }
  units<- unlist(lapply(1:nUnits,repN))
  dt<-data.table(unit = units)
  dt <- dt[,{
    ## if there's just one row in the group of ID's, return nothing
    list(
      index = as.double(seq(1,.N))
    )}, by = c("unit")]
  for (iv in seq(1,nVar)) {
    vname <- paste("var_",iv,sep="")
    expr <- bquote(.(as.name(vname)):=runif(nrow(dt), -5, 5))
    dt[, eval(expr)]  
  }
  setkeyv(dt,c("unit","index"))
  return(dt)
}
dt<-fakeData()
(dim(dt))

# Get a complete map of units / indices
keys = key(dt)
key_dt = dt[, ..keys]
setkey(key_dt)

key_map_values = dt[key_dt]

window_size = 10L
fmla_txt = parse(text=paste("list(window_index = seq(", keys[2], "-window_size, ",
                            keys[2],"-1L, by=1L))",sep=""))
key_map_values[is.na(get(keys[2])), (keys[2]):=-1 ] # Remove NA's
window_dt = key_map_values[, eval(fmla_txt), by=keys]

setkeyv(window_dt, c(keys[1], "window_index")) ## note the join here is on "event, window"
setkeyv(key_map_values, keys)

ans = key_map_values[window_dt]

mov_mean <- function(v,indx) {
  mean(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}
mov_total <- function(v,indx) {
  sum(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}
mov_min <- function(v,indx) {
  max(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}
mov_max <- function(v,indx) {
  min(v, na.rm=TRUE) * (!any(indx < 1) | NA)
}


# q_mov_avg = ans[, lapply(.SD, mov_mean, index), by="unit,i.index"]
#
fmla_txt = parse(text=paste("lapply(.SD, mov_mean, ", keys[2], ")",sep=""))
ikey2=paste("i.",keys[2],sep="")
klist=c(keys[1],ikey2)
q_mov_avg = ans[, eval(fmla_txt), by=klist]
fmla_txt = parse(text=paste("lapply(.SD, mov_total, ", keys[2], ")",sep=""))
q_mov_tot = ans[, eval(fmla_txt), by=klist]
fmla_txt = parse(text=paste("lapply(.SD, mov_max, ", keys[2], ")",sep=""))
q_mov_max = ans[, eval(fmla_txt), by=klist]
fmla_txt = parse(text=paste("lapply(.SD, mov_min, ", keys[2], ")",sep=""))
q_mov_min = ans[, eval(fmla_txt), by=klist]
# 
change_col_names <- function(dt,cname) {
  namesToChange <- names(dt)[grep(paste(keys,collapse="|"),names(dt),invert=TRUE)]
  chgName <- function(n) {
    setnames(dt,n,paste(cname,n,sep="."))
  }
  lapply(namesToChange,chgName)

  # dt$index<-NULL
  fmla_txt = parse(text=paste(keys[2],":=NULL", sep=""))
  dt[,eval(fmla_txt)]
  setnames(dt,ikey2,keys[2])
  setkeyv(dt,keys)
  return(dt)
}
q_mov_avg <- change_col_names(q_mov_avg,"avg")
q_mov_tot <- change_col_names(q_mov_tot,"sum")
q_mov_max <- change_col_names(q_mov_max,"max")
q_mov_min <- change_col_names(q_mov_min,"min")
dt_all <- Reduce(merge,list(dt,q_mov_avg,q_mov_tot,q_mov_max,q_mov_min))
# dt_all$window_index <- dt_all$index - window_size/2
dt_all$window_index <- dt_all[,keys[2],with=FALSE] - window_size/2
# 
nplots=3
fmla_txt = parse(text=paste("punit = sort(sample(unique(dt_all[,",keys[1],"]),nplots))", sep=""))
eval(fmla_txt)
for (idx_key1 in punit) {
  vname <- sample(names(dt)[!(names(dt) %in% key(dt))],1)
  fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,]", sep=""))
  pdt <- melt(eval(fmla_txt),id=c("index"),
              measure=c(vname,paste("max",vname,sep="."),
                        paste("min",vname,sep="."),
                        paste("avg",vname,sep="."),
                        paste("sum",vname,sep=".")))
  pdt[variable==vname,"variable"] = "Data"
  pdt[variable==paste("max",vname,sep="."),"variable"] = "Max"
  pdt[variable==paste("min",vname,sep="."),"variable"] = "Min"
  pdt[variable==paste("avg",vname,sep="."),"variable"] = "Avg"
  pdt[variable==paste("sum",vname,sep="."),"variable"] = "Sum"
  fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,",keys[2],"]", sep=""))
  pdt[variable=="Data","x"]=eval(fmla_txt)
  fmla_txt = parse(text=paste("dt_all[",keys[1],"==idx_key1,window_index]", sep=""))
  pdt[variable!="Data","x"]=eval(fmla_txt)
  gp<-ggplot(data=pdt,aes(x=x,y=value,group=variable,color=variable)) +
    geom_line(size=1.1) +
    labs(title=paste(keys[1],idx_key1),y=vname)+theme(legend.title=element_blank())
  print(gp)
}

<< img src =“ https://image.soinside.com/eyJ1cmwiOiAiaHR0cHM6Ly9pLnN0YWNrLmltZ3VyLmNvbS91clZWdy5wbmcifQ==” alt =“在此处输入图像说明”>“ >>

© www.soinside.com 2019 - 2024. All rights reserved.