Problems clipping a netCDF file

Problem description · Votes: 0 · Answers: 2

I'm working on a proof of concept: I take a netCDF file from NOAA containing global precipitation data for a given year and "repartition" the data down to the US state level. No matter what I try, I end up with no data after performing the clip.

I'm using the shapefile from https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip, which keeps all the states in a single file.

I'm using the netCDF file from ftp://ftp.cdc.noaa.gov/Datasets/cpc_global_precip/precip.1988.nc.

Here is my code:

#!/usr/bin/python

import xarray
import geopandas
import rioxarray  # needed to register the .rio accessor on xarray objects
import glob
import os, sys, getopt
from shapely.geometry import mapping

def partition(shapeFile, ncDirectory, outDirectory, stateName):
    # Load the shape file that either has all the states or just one named state
    sf_all = geopandas.read_file(shapeFile)

    # Filter to our state
    sf = sf_all.loc[sf_all['NAME'] == stateName]

    # Get all the nc files in the ncDirectory so we can repartition all the models
    file_list = sorted(glob.glob(f'{ncDirectory}/*.nc'))

    for file in file_list:
        print(f'Working on: {outDirectory}/{stateName}/{os.path.basename(file)}')

        # Open the file and do some basics
        nc_file = xarray.open_dataset(file)
        nc_file.rio.write_crs("epsg:4326", inplace=True)
        nc_file.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)

        # Convert lon from 0,360 to -180,180 if needed
        if any(nc_file.coords['lon'] > 180 ):
            nc_file.coords['lon'] = (nc_file.coords['lon'] + 180) % 360 - 180
        
        # Convert lat from 0,360 to -180,180 if needed
        if any(nc_file.coords['lat'] > 180 ):
            nc_file.coords['lat'] = (nc_file.coords['lat'] + 180) % 360 - 180

        # Perform the actual clipping here based on the geometry of the {sf} variable which should just be our state
        clipped_nc = nc_file.rio.clip(sf.geometry.apply(mapping), crs="epsg:4326", drop=True)

        if os.path.exists(f'{outDirectory}/{stateName}/{os.path.basename(file)}'):
            # os.remove, not os.removedirs: we are deleting a file, not a directory tree
            os.remove(f'{outDirectory}/{stateName}/{os.path.basename(file)}')
        
        # Save the file to a new netCDF representing the state/model combination
        clipped_nc.to_netcdf(f'{outDirectory}/{stateName}/{os.path.basename(file)}')

def main(argv):
    shapeFile = ''
    stateName = ''
    ncDirectory = ''
    outDirectory = ''

    opts, args = getopt.getopt(argv, "hf:n:o:s:", ["shapefile=", "ncdirectory=", "outputdir=", "statename="])
    for opt, arg in opts:
        if opt =='-h':
            print('repartition.py -f <shapefile> -n <ncfiledirectory> -o <outputdir> -s <statename>')
            quit(0)
        elif opt in ("-f", "--shapefile"):
            shapeFile = arg.rstrip("/")
        elif opt in ("-n", "--ncdirectory"):
            ncDirectory = arg.rstrip("/")
        elif opt in ("-o", "--outputdir"):
            outDirectory = arg.rstrip("/")
        elif opt in ("-s","--statenames"):
            stateName = arg
        
    isExists = os.path.exists(f'{outDirectory}/{stateName}')
    if not isExists:
        os.makedirs(f'{outDirectory}/{stateName}')
    
    partition(shapeFile,ncDirectory,outDirectory, stateName)

if __name__ == "__main__":
    main(sys.argv[1:])
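For reference, an invocation looks something like this (the directory names and the state here are just examples):

python repartition.py -f cb_2018_us_state_500k/cb_2018_us_state_500k.shp -n ncfiles -o output -s Colorado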

I'm not normally a Python user (I actually prefer C/C++/C#), but Python is required here, so that's what I'm using. The other constraint is that this is a Windows environment (not my choice). Maybe one day I'll convince the powers that be that running a Linux container isn't a bad thing and can be done in Azure, so we can keep being a "Microsoft shop", but for now it's Windows.

python netcdf
2 Answers

0 votes

The problem is in the conversion of the longitudes from the [0, 360] range to [-180, 180]. The longitudes must be sorted in ascending order, but after converting the values above 180° to negative ones they are no longer sorted: the coordinate runs from 0 up to 180 and then from -180 up to -0. Clipping does not appear to work in that case.
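To illustrate, a small sketch (assuming the CPC grid, whose longitudes run from 0.25° to 359.75° in 0.5° steps):

import numpy as np

lon = np.arange(0.25, 360, 0.5)      # 0.25 .. 359.75
lon_conv = (lon + 180) % 360 - 180   # 0.25 .. 179.75 followed by -179.75 .. -0.25
print(lon_conv[0], lon_conv[-1])     # 0.25 -0.25: no longer monotonically increasing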

Since the continental US lies at negative longitudes, a simple workaround is to drop everything between longitude 0 and +180. That leaves only the negative longitudes, increasing from -180 to 0. You can do this by adding the following line after opening the dataset with xarray:

nc_file = nc_file.where(nc_file.lon > 180, drop=True)

After that, instead of computing with the modulo operator %, you can shift the longitudes into the target range simply with:

nc_file["lon"] = nc_file.lon - 360

I would also suggest removing the conversion of the latitudes, since they are always given as values between -90° and +90°.
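If you do need to keep the full global grid (for example for a shape that spans the antimeridian), an alternative sketch is to keep the modulo conversion and then restore the ascending order with sortby:

# Convert 0..360 to -180..180, then re-sort so the coordinate increases again
nc_file.coords["lon"] = (nc_file.coords["lon"] + 180) % 360 - 180
nc_file = nc_file.sortby("lon")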

A minimal working example (MWE) could therefore be:

import xarray
import geopandas
import rioxarray  # needed to register the .rio accessor
from shapely.geometry import mapping

stateName = "Alaska"

sf_all = geopandas.read_file("cb_2018_us_state_500k/cb_2018_us_state_500k.shp")
sf = sf_all.loc[sf_all["NAME"] == stateName]

nc_file = xarray.open_dataset("precip.1988.nc")

# Keep only longitudes between 180° and 360°
nc_file = nc_file.where(nc_file.lon > 180, drop=True)
# Convert longitudes from 180,360 to -180,0
nc_file["lon"] = nc_file.lon - 360

nc_file.rio.write_crs("epsg:4326", inplace=True)
nc_file.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)
clipped_nc = nc_file.rio.clip(sf.geometry.apply(mapping), crs="epsg:4326", drop=True)

# Show the result
import matplotlib.pyplot as plt
clipped_nc.precip.mean("time").plot()
plt.show()

Hope this helps.


0 votes

Here is the final version. I think the re-sort of the longitudes was the key to all of it.

One thing I might still look at is regridding the data to a finer resolution (from 0.25 down to 1/16) before entering the loop, so I can retain more information in the polygon-based output. Right now, at 0.25, a lot is lost in the output, unless someone knows a way to include grid points that are only partially contained in the shape instead of the current fully-contained-only behavior (see the sketch after the code below).

In any case, this is really proof-of-concept code, so I probably won't take it much further myself, but it was good to see it done this way rather than working with a cube and manually brute-forcing the data into the final result. Now that I think about it, the one thing I still want to add is stripping all the leap days from the files, since that's a requirement.

Anyway, enough rambling. I want to thank everyone for the help. Not only did I get to learn some new Python, I also had to really learn about netCDF, CRS, and all that jazz, so at the very least I learned a lot of new things in a short time.

import xarray as xr
import rioxarray  # needed to register the .rio accessor
import dask
import time
import geopandas as gpd
import sys, getopt
from shapely.geometry import mapping

if __name__ == "__main__":
    argv = sys.argv[1:]

    shapeFile = ''
    stateName = ''
    ncFileglob = ''
    ncFileDir = ''
    outDirectory = ''
    ncFileOutPartName = ''

    opts, args = getopt.getopt(argv, "hf:g:n:o:s:", ["shapefile=", "ncfiledir=", "ncfileglob=", "outputdir=", "statename="])
    for opt, arg in opts:
        if opt =='-h':
            print('repartition.py -f <shapefile> -n <ncFileDir> -g <ncFileglob> -o <outputdir> -s <statename>')
            quit(0)
        elif opt in ("-f", "--shapefile"):
            shapeFile = arg.rstrip("/")
        elif opt in ("-g", "--ncfileglob"):
            ncFileglob = arg.rstrip("/")
        elif opt in ("-o", "--outputdir"):
            outDirectory = arg.rstrip("/")
        elif opt in ("-s","--statename"):
            stateName = arg
        elif opt in ("-n","--ncFileDir"):
            ncFileDir = arg.rstrip("/")
    
    # Open the files and do some basics
    # decode_times=True so the time axis is decoded to datetime64, which the year extraction below relies on
    nc_file = xr.open_mfdataset(f'{ncFileDir}/{ncFileglob}', chunks={'time': 365, 'lat': 100, 'lon': 100},
                                parallel=True, use_cftime=False, decode_times=True)

    # Load the shape file that either has all the states or just one named state
    sf_all = gpd.read_file(shapeFile)

    # Convert lon from 0,360 to -180,180 if needed
    if (nc_file.coords['lon'] > 180).any():
        nc_file.coords['lon'] = (nc_file.coords['lon'] + 180) % 360 - 180

    # Resort the lon dimension so that it starts at -180 and increases
    nc_file = nc_file.sortby(nc_file.lon)

    # Set the dataset CRS and signify the x,y dimension labels
    nc_file.rio.write_crs("epsg:4326", inplace=True)
    nc_file.rio.set_spatial_dims(x_dim="lon", y_dim="lat", inplace=True)

    # Get the first and last years for part of the filename
    # Extract the time dimension
    time_data = nc_file['time']

    # Find the first and last time values
    first_time = time_data.min().values
    last_time = time_data.max().values

    # And finally Convert time values to string representations of years
    first_year = first_time.astype('datetime64[Y]').astype(str)
    last_year = last_time.astype('datetime64[Y]').astype(str)

    # Start building the later part of the filename
    ncFileOutPartName = f"{nc_file.attrs['cmip6_source_id']}_{nc_file.attrs['scenerio']}_{nc_file.attrs['variant_label']}_{first_year}-{last_year}"

    # Record the start of processing all the polygons
    overall_start_time = time.time()

    # Loop through all the shapes in the shapefile
    for index, row in sf_all.iterrows():
        polygon_name = row['NAME']
        print(f'Working on {polygon_name}', end='')
        
        sf = sf_all.loc[sf_all["NAME"] == polygon_name]
        try:   
            # Record the start time of the code run for this polygon
            start_time = time.time()             
            # Perform the actual clipping here based on the geometry of the {sf} variable which should just be our state
            clipped_nc = nc_file.rio.clip(sf.geometry.apply(mapping), crs="epsg:4326", drop=True)

            # Build the final filename
            nc_repartitioned_file = f'{outDirectory}/{polygon_name}_{ncFileOutPartName}.nc'

            # Write the file out for the polygon
            clipped_nc.to_netcdf(nc_repartitioned_file)

            # Record the end time of the code run for this polygon
            end_time = time.time()

            # Calculate the elapsed time
            elapsed_time = round((end_time - start_time) / 60,2)

            print(f' : Completed in {elapsed_time} minutes')

        except Exception as error:
            print('  there was a problem')
            print("The error was:", error)

    # Record the end of processing all the polygons
    overall_end_time = time.time()

    # Calculate the elapsed time of processing all the polygons
    overall_elapsed_time = round((overall_end_time - overall_start_time) / 60,2)

    # Get the total number of polygons we had to deal with
    total_polygons = sf_all['NAME'].count()

    # now report everything
    print(f'The run for {total_polygons} polygons took {overall_elapsed_time} minutes to complete')

    # Close the dataset and cleanup
    nc_file.close()
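On the partially-contained grid points mentioned above: rioxarray's clip accepts an all_touched flag that includes every cell touched by the polygon rather than only the cells whose centers fall inside it, which may recover some of what is lost at 0.25°. A sketch of the clip call with that flag:

clipped_nc = nc_file.rio.clip(
    sf.geometry.apply(mapping),
    crs="epsg:4326",
    all_touched=True,  # also keep grid cells only partially inside the shape
    drop=True,
)

And for stripping the leap days, a minimal sketch, assuming the time coordinate is decoded to datetime64 values:

# Drop every Feb 29 from the time axis
leap_day = (nc_file.time.dt.month == 2) & (nc_file.time.dt.day == 29)
nc_file = nc_file.sel(time=~leap_day)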