我有很多大型数据框(超过 500,000 行),其中的日期时间信息分散存储在多列中:不是 MMDDYYYY 这样的单一字段,而是第一列为年份,第二列为儒略日(即一年中的第几天),第三列为时间。数据的结构如下:
# Example data with the same layout as the real data frames: the timestamp is
# split across three columns (YEAR, JULIEN, Time), followed by two numeric
# data columns. Values are random, so the contents differ between runs unless
# a seed is set beforehand.
df <- data.frame(
  YEAR = sample(2000:2020, 10000, replace = TRUE), # integer year, 2000..2020
  JULIEN = sample(1:365, 10000, replace = TRUE),   # integer day number, 1..365
  Time = sample(0:59, 10000, replace = TRUE),      # integer, 0..59
  dataVar1 = runif(10000, 1.0, 10.0),              # uniform on [1, 10]
  dataVar2 = runif(10000, 20.0, 100.0)             # uniform on [20, 100]
)
到目前为止,我一直在用这个方法来处理:
# Row-by-row conversion of the YEAR / JULIEN / Time columns of `df` into a
# single POSIXct column named `timeR`.
#
# NOTE(review): the result vector is extended by one element per iteration,
# which creates a new vector every time; this is the slow part on large data.
timeR <- vector()
for (i in 1:nrow(df)) {
  # Day number JULIEN counted from 31 December of the previous year gives the
  # calendar date; Time is zero-padded to four digits and pasted after it.
  currentTime <- paste(
    as.Date(df[["JULIEN"]][i], origin = paste0(df[["YEAR"]][i] - 1, "-12-31")),
    formatC(df[["Time"]][i], width = 4, format = "d", flag = "0")
  )
  timeR <- append(timeR, currentTime)
}

# Put the text timestamps in front of the remaining columns, dropping the
# three source columns they were built from.
df <- cbind(timeR, df[, !(names(df) %in% c("YEAR", "JULIEN", "Time"))])

# Parse the "YYYY-MM-DD HHMM" text into POSIXct in the EST time zone.
df$timeR <- as.POSIXct(df$timeR, format = "%Y-%m-%d %H%M", tz = "EST")

# Remove the temporaries so only `df` is left behind.
rm(timeR, i, currentTime)
但这需要大量的时间。有什么办法可以让它运行得更快吗?谢谢你。
`paste` 和 `as.Date` 都是矢量化的,因此可以直接对整列进行操作,而不需要循环:
# Vectorised version: work on whole columns of `df` at once instead of looping
# over rows.

# One Date per row: day number JULIEN counted from 31 December of the
# previous year.
v1 <- as.Date(
  df[["JULIEN"]],
  origin = paste0(df[["YEAR"]] - 1, "-12-31")
)

# One string per row: the date followed by Time zero-padded to four digits.
currentTime <- paste(
  v1,
  formatC(df[["Time"]], width = 4, format = "d", flag = "0")
)
# Benchmark of the row-by-row approach: system.time() reports how long the
# braced expression takes to evaluate. Reads df$YEAR, df$JULIEN and df$Time.
system.time({
timeR<-vector()
for (i in 1:dim(df)[1]){
# Date for row i (day number counted from 31 December of the previous year),
# pasted together with Time zero-padded to four digits.
currentTime<-paste(as.Date(df$JULIEN[i], origin=paste(df$YEAR[i]-1,"-12-31", sep = "")),formatC(df$Time[i], width = 4, format = "d", flag = "0"))
# c() builds a new, longer vector on every iteration.
timeR<-c(timeR,currentTime)
}})
# Recorded output of the call above (seconds; varies by machine):
#    user  system elapsed
#   1.300   0.061   1.366
# Benchmark of the vectorised approach: system.time() reports how long the
# braced expression takes to evaluate. Reads df$YEAR, df$JULIEN and df$Time.
system.time({
# All dates at once: day number counted from 31 December of the previous year.
v1 <- as.Date(df$JULIEN, origin = paste0(df$YEAR-1,"-12-31"))
# All timestamp strings at once: date plus Time zero-padded to four digits.
currentTime <- paste(v1, formatC(df$Time, width = 4, format = "d", flag = "0"))
})
# Recorded output of the call above (seconds; varies by machine):
#    user  system elapsed
#   0.076   0.004   0.080