我正在使用 geopandas,为了简化问题,我有一个可能包含很多行的地理数据框。这些行代表在 epsg:4326 投影中投影在地面上的正方形,并且可以代表特定区域的网格。
所有方块都具有相同的大小,我想根据特定的输入值将它们按 4 × 4 或 9 × 9 分组。
这里有一个例子:
import shapely
import geopandas as gpd
a0 = shapely.geometry.Point([0, 0]).buffer(0.5, cap_style = 'square')
a1 = shapely.affinity.translate(a0, xoff=1)
a2 = shapely.affinity.translate(a0, xoff=2)
a3 = shapely.affinity.translate(a0, xoff=3)
a4 = shapely.affinity.translate(a0, xoff=4)
b0 = shapely.affinity.translate(a0, yoff=1)
b1 = shapely.affinity.translate(b0, xoff=1)
b2 = shapely.affinity.translate(b0, xoff=2)
b3 = shapely.affinity.translate(b0, xoff=3)
b4 = shapely.affinity.translate(b0, xoff=4)
c0 = shapely.affinity.translate(b0, yoff=1)
c1 = shapely.affinity.translate(c0, xoff=1)
c2 = shapely.affinity.translate(c0, xoff=2)
c3 = shapely.affinity.translate(c0, xoff=3)
liste_geo = [a0, a1, a2, a3, a4, b0, b1, b2, b3, b4, c0, c1, c2, c3]
column = [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4]
row = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2]
gdf = gpd.GeoDataFrame({'column' : column, 'row' : row, 'geometry' : liste_geo},
crs = 'epsg:4326',
index = range(len(liste_geo)))
gdf.plot(facecolor = 'none')
我的网格由 gdf 变量表示,我得到了具有特定大小的正方形。如果我的输入变量是 2,我想将 4 个正方形分组为红色。在这个示例中,我将获得一个包含 7 个新正方形的地理数据框。如果我的输入变量是 3,我将得到 2 个新方块的 gdf,右侧的 2 个将丢失。
你有什么想法吗?
非常感谢。
我制作了第一个版本的函数来解决这个问题,但是使用大型 gdf(~ 100k 行)进行计算太长了
这是代码
def define_spot_grid(gdf_base_grid, size_base, size_swath):
"""
gdf_base_grid : our initial grid
size_base : size of the initial square
size_swath : size of our bigger square
"""
rapport = size_swath/size_base
unique_column = gdf_base_grid['column'].unique()
unique_row = gdf_base_grid['row'].unique()
liste_bigger_spot = []
for k, v in gdf_base_grid.sort_values(by = ['row', 'column']).iterrows():
# actually for every square i have a row and column information
current_row, current_column = v['row'], v['column']
bound_row, bound_column = v['row'] + rapport - 1, v['column'] + rapport - 1
spot_bl = gdf_base_grid[(gdf_base_grid['row'] == current_row) & (gdf_base_grid['column'] == current_column)]
spot_tl = gdf_base_grid[(gdf_base_grid['row'] == bound_row) & (gdf_base_grid['column'] == current_column)]
spot_tr = gdf_base_grid[(gdf_base_grid['row'] == bound_row) & (gdf_base_grid['column'] == bound_column)]
spot_br = gdf_base_grid[(gdf_base_grid['row'] == current_row) & (gdf_base_grid['column'] == bound_column)]
# checking if all these squares exist
if (not spot_bl.empty) and (not spot_tl.empty) and (not spot_tr.empty) and (not spot_br.empty):
point_bl = spot_bl.unary_union.centroid
point_tl = spot_tl.unary_union.centroid
point_tr = spot_tr.unary_union.centroid
point_br = spot_br.unary_union.centroid
# we define a spot with 4 points
bigger_spot = shapely.geometry.Polygon([point_bl, point_tl, point_tr, point_br])
gdf_bigger_spot = gpd.GeoDataFrame({'geometry' : bigger_spot}, crs = 'epsg:4326', index = [0])
# we check how many spot our bigger spot intersects using a spatial join
sjoin = gpd.sjoin(gdf_bigger_spot, gdf_base_grid, how = 'right', predicate = 'intersects').dropna()
# example : if rapport = 3 we need to get 9 squares, if it is less we do not keep
if len(sjoin) == rapport**2:
liste_bigger_spot.append(sjoin.unary_union)
gdf_bigger = gpd.GeoDataFrame({'geometry' : liste_bigger_spot}, crs = 'epsg:4326', index = range(len(liste_bigger_spot)))
return gdf_bigger
在 python 中循环大型数据帧往往很慢。如果每个循环中都涉及 sjoin 这样的空间操作,这种效果往往会变得更糟。
因此,如果可能的话,如果性能很重要,应该避免此类循环。
以下代码片段说明了如何在不使用通用循环的情况下实现相同的逻辑:
from matplotlib import pyplot as plt
import numpy as np
import shapely
import geopandas as gpd
# Init parameters
size_base = 1
rapport = 2
# Create base grid
a0 = shapely.geometry.Point([0, 0]).buffer(size_base / 2, cap_style="square")
a1 = shapely.affinity.translate(a0, xoff=1)
a2 = shapely.affinity.translate(a0, xoff=2)
a3 = shapely.affinity.translate(a0, xoff=3)
a4 = shapely.affinity.translate(a0, xoff=4)
b0 = shapely.affinity.translate(a0, yoff=1)
b1 = shapely.affinity.translate(b0, xoff=1)
b2 = shapely.affinity.translate(b0, xoff=2)
b3 = shapely.affinity.translate(b0, xoff=3)
b4 = shapely.affinity.translate(b0, xoff=4)
c0 = shapely.affinity.translate(b0, yoff=1)
c1 = shapely.affinity.translate(c0, xoff=1)
c2 = shapely.affinity.translate(c0, xoff=2)
c3 = shapely.affinity.translate(c0, xoff=3)
liste_geo = [a0, a1, a2, a3, a4, b0, b1, b2, b3, b4, c0, c1, c2, c3]
column = [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4]
row = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2]
base_grid_gdf = gpd.GeoDataFrame(
{"column": column, "row": row, "geometry": liste_geo},
crs="epsg:4326",
index=range(len(liste_geo)),
)
# Create grid with bigger spots, covering the bounds of the base grid
size_swath = size_base * rapport
bounds = base_grid_gdf.total_bounds
bigger_spots = []
for xmin in np.arange(bounds[0], bounds[2], size_swath):
for ymin in np.arange(bounds[1], bounds[3], size_swath):
bigger_spots.append(
shapely.geometry.box(xmin, ymin, xmin + size_swath, ymin + size_swath)
)
bigger_spots_gdf = gpd.GeoDataFrame(geometry=bigger_spots, crs=base_grid_gdf.crs)
# Check how many small spots our bigger spots intersect
# Join with centroids of base grid to avoid spots that only touch being counted
base_grid_centroid_gdf = gpd.GeoDataFrame(
geometry=base_grid_gdf.centroid, crs=base_grid_gdf.crs
)
sjoin_gdf = bigger_spots_gdf.sjoin(
base_grid_centroid_gdf, how="inner", predicate="intersects"
)
number_small_in_bigger_df = sjoin_gdf[["index_right"]].groupby(level=0).count()
# Only keep the bigger spots that have the minimum needed number small ones in them
indices_to_keep = number_small_in_bigger_df[
number_small_in_bigger_df.index_right >= rapport**2
].index.tolist()
bigger_gdf = bigger_spots_gdf.loc[indices_to_keep]
# Plot result
f, ax = plt.subplots()
base_grid_gdf.plot(ax=ax, facecolor="none")
bigger_gdf.plot(ax=ax, facecolor="none", edgecolor="red", linewidth=2)
plt.show()
结果: