我有一个for loop
,它在每次迭代后都会产生一个数据帧。我想将所有数据帧附加在一起,但发现困难。以下是我的正在尝试,请提出解决方案的建议:
d = NULL
for (i in 1:7) {
# vector output
model <- #some processing
# add vector to a dataframe
df <- data.frame(model)
}
df_total <- rbind(d,df)
请勿在循环内执行。列出清单,然后将其合并到循环之外。
datalist = list()
for (i in 1:5) {
# ... make some data
dat <- data.frame(x = rnorm(10), y = runif(10))
dat$i <- i # maybe you want to keep track of which iteration produced it?
datalist[[i]] <- dat # add it to your list
}
big_data = do.call(rbind, datalist)
# or big_data <- dplyr::bind_rows(datalist)
# or big_data <- data.table::rbindlist(datalist)
这是一种更像R的方式。它也可以快得多,特别是如果您使用dplyr::bind_rows
或data.table::rbindlist
进行数据帧的最终组合时。
您应该尝试此:
df_total = data.frame()
for (i in 1:7){
# vector output
model <- #some processing
# add vector to a dataframe
df <- data.frame(model)
df_total <- rbind(df_total,df)
}
再次确认是正确的,但是要使其正常工作,首先要使用已经至少有一列的数据框
model <- #some processing
df <- data.frame(col1=model)
for (i in 2:17)
{
model <- # some processing
nextcol <- data.frame(model)
colnames(nextcol) <- c(paste("col", i, sep="")) # rename the comlum
df <- cbind(df, nextcol)
}
在Coursera课程,R编程入门中,测试了此技能。他们给所有学生332个单独的csv文件,并要求他们以编程方式组合几个文件,以计算污染物的平均值。
这是我的解决方案:
# create your empty dataframe so you can append to it.
combined_df <- data.frame(Date=as.Date(character()),
Sulfate=double(),
Nitrate=double(),
ID=integer())
# for loop for the range of documents to combine
for(i in min(id): max(id)) {
# using sprintf to add on leading zeros as the file names had leading zeros
read <- read.csv(paste(getwd(),"/",directory, "/",sprintf("%03d", i),".csv", sep=""))
# in your loop, add the files that you read to the combined_df
combined_df <- rbind(combined_df, read)
}
尝试使用rbindlist
方法而非rbind
,因为它非常非常快。
示例:
library(data.table)
##### example 1: slow processing ######
table.1 <- data.frame(x = NA, y = NA)
time.taken <- 0
for( i in 1:100) {
start.time = Sys.time()
x <- rnorm(100)
y <- x/2 +x/3
z <- cbind.data.frame(x = x, y = y)
table.1 <- rbind(table.1, z)
end.time <- Sys.time()
time.taken <- (end.time - start.time) + time.taken
}
print(time.taken)
> Time difference of 0.1637917 secs
####example 2: faster processing #####
table.2 <- list()
t0 <- 0
for( i in 1:100) {
s0 = Sys.time()
x <- rnorm(100)
y <- x/2 + x/3
z <- cbind.data.frame(x = x, y = y)
table.2[[i]] <- z
e0 <- Sys.time()
t0 <- (e0 - s0) + t0
}
s1 = Sys.time()
table.3 <- rbindlist(table.2)
e1 = Sys.time()
t1 <- (e1-s1) + t0
t1
> Time difference of 0.03064394 secs
这里有一些tidyverse
和自定义功能选项可能会根据您的需要起作用:
library(tidyverse)
# custom function to generate, filter, and mutate the data:
combine_dfs <- function(i){
data_frame(x = rnorm(5), y = runif(5)) %>%
filter(x < y) %>%
mutate(x_plus_y = x + y) %>%
mutate(i = i)
}
df <- 1:5 %>% map_df(~combine_dfs(.))
df <- map_df(1:5, ~combine_dfs(.)) # both give the same results
> df %>% head()
# A tibble: 6 x 4
x y x_plus_y i
<dbl> <dbl> <dbl> <int>
1 -0.973 0.673 -0.300 1
2 -0.553 0.0463 -0.507 1
3 0.250 0.716 0.967 2
4 -0.745 0.0640 -0.681 2
5 -0.736 0.228 -0.508 2
6 -0.365 0.496 0.131 3
如果您有需要合并的文件目录,则可以执行类似的操作:
dir_path <- '/path/to/data/test_directory/'
list.files(dir_path)
combine_files <- function(path, file){
read_csv(paste0(path, file)) %>%
filter(a < b) %>%
mutate(a_plus_b = a + b) %>%
mutate(file_name = file)
}
df <- list.files(dir_path, '\\.csv$') %>%
map_df(~combine_files(dir_path, .))
# or if you have Excel files, using the readxl package:
combine_xl_files <- function(path, file){
readxl::read_xlsx(paste0(path, file)) %>%
filter(a < b) %>%
mutate(a_plus_b = a + b) %>%
mutate(file_name = file)
}
df <- list.files(dir_path, '\\.xlsx$') %>%
map_df(~combine_xl_files(dir_path, .))