我正在使用raster和glcm软件包来计算卫星图像上的Haralick纹理特征。我已经使用单个内核成功运行了glcm()函数,但是正在并行运行它。这是我正在使用的代码:
# tiles is a list of raster extents, r is a raster
registerDoParallel(7)
out_raster = foreach(i=1:length(tiles),.combine = merge,.packages=c("raster","glcm")) %dopar%
glcm(crop(r,tiles[[i]]), n_grey=16, window=c(17,17), shift = c(1,1),
min_x = rmin, max_x = rmax)
[当我检查创建的临时文件时,似乎合并的每个步骤都会创建一个临时文件,这会占用大量硬盘空间。这是整体图片(2GB):
这是两个临时文件:Merge Step 1 Merge Step 2
因为每个图块的glcm函数输出为3 GB,所以为每个逐步合并操作创建一个临时文件都会创建约160GB的临时栅格文件。有没有更节省空间的方法可以并行运行此程序?
我设法通过使用gdal和构建vrts来节省硬盘空间。下面是我编写的在glcm包中的示例数据上运行的代码。步骤为1:创建磁贴的vrt文件; 2)在每个vrt磁贴上并行运行glcm函数(请参阅glcm_parallel函数); 3)将图块合并为vrt,并使用gdal warp写入输出栅格。 vrt文件非常小,唯一的临时文件就是glcm函数创建的文件。这对于大型栅格应该有很大帮助。
#Load Packages
library(raster)
library(sf)
library(rgdal)
library(glcm)
library(doParallel)
library(gdalUtils)
#Source helper functions
source("./tilebuild_buff.R")
source("./glcm_parallel.R")
#Read raster - example data from glcm package (saved to disk)
rasterfile = "./L5TSR_1986.tif"
r = raster("L5TSR_1986.tif")
#Create tiles directory if it doesn't exist and clear files if it exists
if(!file.exists("./tiles")){dir.create("./tiles")}
file.remove(list.files("./tiles/",full.names=T))
#Calculate tiles for parallel processing - returns x and y offsets, and widths
#to use with gdal_translate
jobs_buff = tilebuild_buff(r,nx=5,ny=2,buffer=c(5,5))
#Create vrt files for buffered tiles
for (i in 1:length(jobs_buff[,1])){
fin = rasterfile
fout = paste0("./tiles/t_",i,'.vrt')
ex = as.numeric(jobs_buff[i,])
gdal_utils('translate',fin,fout,options = c('-srcwin',ex,'-of','vrt'))
}
#Read in vrt files of raster tiles and set the nodata value
input.rasters = lapply(paste0("./tiles/", list.files("./tiles/",pattern="\\.vrt$")), raster)
for(i in 1:length(input.rasters)){ NAvalue(input.rasters[[i]])= -3.4E38 }
#Create a directory for temporary raster grids and clear files
tempdir = "./rastertemp/"
if(!file.exists(tempdir)){dir.create(tempdir)}
file.remove(list.files(tempdir,full.names=T))
registerDoParallel(6)
#Determine min and max values over original raster
rmin = cellStats(r,'min')
rmax = cellStats(r,'max')
#Run glcm function in parallel
glcm_split = foreach(i=1:length(jobs_buff[,1]),.packages=c("raster","glcm")) %dopar%
glcm_parallel(inlist = input.rasters,temp=tempdir,window=c(3,3),n_grey=16,
min_x=rmin,max_x=rmax)
#Get list of temp raster files created by glcm function
temps = paste0(tempdir,list.files(tempdir,pattern="\\.grd$"))
#trim off buffer (from tilebuild_buff function) and create mosaic raster
mosaic_rasters(temps,dst_dataset = "./mosaic.vrt", trim_margins = 5, srcnodata=-3.4E38,overwrite=T)
#write output tif
vrt_mosaic = "./mosaic.vrt"
outtif = "./final_merged.tif"
gdalwarp(vrt_mosaic,outtif,overwrite=T,verbose=T)
这两个辅助函数在这里:
glcm_parallel <- function(inlist, temp, n_grey=16, window=c(11,11), shift=c(1,1), min_x=NULL, max_x=NULL){
require(glcm)
#todisk option required if output rasters are small enough to fit in memory
rasterOptions(tmpdir = temp, todisk=T, maxmemory = 1E8)
## run glcm over tile
r_glcm=glcm(inlist[[i]], n_grey = n_grey, window = window, shift=shift, min_x=min_x, max_x=max_x, na_opt = 'any')
}
和这里:
tilebuild_buff <- function(r, nx=5, ny=2, buffer=c(0,0)){
round_xw = floor(ncol(r)/nx)
xsize = c(rep(round_xw,nx-1), round_xw + ncol(r)%%nx)
xoff = c(0,cumsum(rep(round_xw,nx-1)))
round_yh = floor(nrow(r)/ny)
ysize = c(rep(round_yh,ny-1), round_yh + nrow(r)%%ny)
yoff = c(0,cumsum(rep(round_yh,ny-1)))
pix_widths = expand.grid(xsize = xsize ,ysize = ysize)
offsets = expand.grid(xoff = xoff,yoff = yoff)
srcwins = cbind(offsets,pix_widths)
srcwins_buff = srcwins
#Add buffer
srcwins_buff$xoff = srcwins$xoff - buffer[1]
srcwins_buff$yoff = srcwins$yoff - buffer[2]
srcwins_buff$xsize = srcwins$xsize + 2*buffer[1]
srcwins_buff$ysize = srcwins$ysize + 2*buffer[2]
return(srcwins_buff)
}