我有一个包含多边形和地区名称的表格,还有一份带有精确经纬度的购买数据。我编写了一个函数,逐一检查每个坐标点是否落在某个多边形内,然后为该笔购买分配对应的地区名称。问题是,由于缺乏矢量化并使用了嵌套 for 循环,它的运行速度非常非常慢(谢谢,pandas)。我该如何优化,才能在更短的时间内处理 1000 多万行数据?
def get_district_name(geo_df: pd.DataFrame, ship_df: pd.DataFrame, col_name: str, frac: float = 0.65) -> pd.DataFrame:
    """Label a random sample of purchases with the district containing them.

    A fraction ``frac`` of ``ship_df`` is sampled without replacement (fixed
    seed 42) and re-indexed from 0. Each sampled row is turned into a point
    from its ``address_longitude`` / ``address_latitude`` columns and tested
    against the geometries in ``geo_df.geometry``.

    Args:
        geo_df: Table with a ``geometry`` column and a ``col_name`` column.
        ship_df: Purchases with ``address_longitude`` and
            ``address_latitude`` columns.
        col_name: Column of ``geo_df`` holding the district name.
        frac: Fraction of ``ship_df`` rows to sample.

    Returns:
        The sampled rows with an added ``municipal_district_name`` column;
        rows that fall inside no geometry get an empty string.

    Note:
        This is a plain nested loop: every point is tested against the
        geometries one by one, so it scales with rows x polygons.
    """
    sample_ship = ship_df.sample(frac=frac, replace=False, random_state=42).reset_index(drop=True)

    # Pair geometries with their names positionally, once, so the inner loop
    # does not depend on geo_df having a default 0..n-1 index.
    districts = list(zip(geo_df.geometry, geo_df[col_name]))

    coords = zip(sample_ship['address_longitude'], sample_ship['address_latitude'])
    names = []
    for lon, lat in tqdm(coords, total=len(sample_ship)):
        point = shapely.geometry.Point(lon, lat)
        name = ''
        for polygon, district in districts:
            if point.within(polygon):
                name = district
                # Stop at the first containing geometry; the original
                # `continue` here was a no-op and kept scanning.
                break
        names.append(name)

    # Assign the whole column at once instead of chained per-cell writes,
    # which may silently write to a temporary copy.
    sample_ship['municipal_district_name'] = names
    return sample_ship
您可以使用 `within` 谓词配合 `sjoin`(空间连接),但请注意,在 1000 多万行数据上进行空间连接不会很快。
def get_district_name(
    geo_df: gpd.GeoDataFrame,
    ship_df: pd.DataFrame,
    col_name: str,
    frac: float = 0.65,
) -> pd.DataFrame:
    """Label a random sample of purchases via a spatial join.

    A fraction ``frac`` of ``ship_df`` is sampled without replacement (fixed
    seed 42), converted to a point layer from its ``address_longitude`` /
    ``address_latitude`` columns, and left-joined to ``geo_df`` with the
    ``within`` predicate.

    Args:
        geo_df: District geometries with a ``col_name`` column.
        ship_df: Purchases with ``address_longitude`` and
            ``address_latitude`` columns.
        col_name: Column of ``geo_df`` holding the district name.
        frac: Fraction of ``ship_df`` rows to sample.

    Returns:
        The sampled rows (original index labels kept) with the columns of
        ``ship_df`` plus ``col_name``. Because the join is a left join,
        rows matching no geometry are kept with a missing ``col_name``.
    """
    ship_sample_df = ship_df.sample(
        frac=frac, replace=False, random_state=42,
    )
    sample_ship_gdf = gpd.GeoDataFrame(
        ship_sample_df,
        geometry=gpd.points_from_xy(
            ship_sample_df["address_longitude"],
            ship_sample_df["address_latitude"],
        ),
        # Coordinates are compared to geo_df as-is, so tag the points with
        # geo_df's CRS to state that explicitly and avoid sjoin's
        # CRS-mismatch warning. NOTE(review): assumes the purchase
        # coordinates are already expressed in geo_df's CRS - confirm.
        crs=geo_df.crs,
    )
    joined = gpd.sjoin(
        sample_ship_gdf,
        geo_df,
        predicate="within",
        how="left",
    )
    # Keep only the purchase columns and the district name; this drops the
    # helper geometry and the join's bookkeeping columns.
    return joined[list(ship_df) + [col_name]]
输出:
>>> get_district_name(gdf_districts, df_purchases, "municipal_district_name", 1)
address_longitude address_latitude municipal_district_name
1 1.00 2.00 district_2
4 1.50 1.60 district_2
2 0.70 3.00 NaN
0 3.00 2.50 NaN
3 0.20 0.30 district_1
[5 rows x 3 columns]