查找 R 中与其他多边形质心一定距离内的多边形质心子集

问题描述 投票:0回答:1

我正在尝试计算牙买加 shapefile 中每个行政区 5 公里范围内行政单位的汇总统计数据。我使用 dnearneigh() 来查找这些管理单位,但随后我不确定如何最好地使用作为输出获得的列表来计算摘要统计数据。我尝试使用空间子集,但这并没有奏效,因此有关如何最好地执行此操作的建议将很有用。

shp_centroid <- st_point_on_surface(sf_communities)

shp_centroid_coord <- st_coordinates(shp_centroid)

shp_dist <- dnearneigh(shp_centroid_coord,0,5000)
subset <- sf_communities[unlist(shp_dist),]

r subset shapefile sf nearest-neighbor
1个回答
0
投票

一种选择是使用嵌套的 tibbles / data.frames,其中每个管理单元都有自己的邻居+本身的 tibble。由于

dplyr
中的行操作,从此类结构中收集汇总统计数据非常方便。始终可以选择使用
unnest()
来延长嵌套数据集并使用
group_by()
/
summarise()
来代替。

以下示例使用

nc
包中的
sf
数据集,距离阈值设置为 50km 以更好地匹配这些形状大小。它不使用
spdep
包,而是仅使用
sf
函数,这意味着我们将使用的列表(
sgbp
- 稀疏几何二进制谓词列表,标准
sf
内容)可能具有稍微不同的结构。这里邻域被定义为 50 公里缓冲区和县多边形的交集,而不是问题标题中描述的县质心;所以这可能是另一个需要更改/审查的细节。

准备示例,测试并可视化单个县的子集。
library(sf)
#> Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(dplyr, warn.conflicts = FALSE)
library(purrr)
library(tidyr)
library(ggplot2)

# example data from sf package examples
nc = read_sf(system.file("shape/nc.shp", package="sf")) %>% 
  select(NAME, AREA, starts_with("BIR"))
nc
#> Simple feature collection with 100 features and 4 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
#> Geodetic CRS:  NAD27
#> # A tibble: 100 × 5
#>    NAME         AREA BIR74 BIR79                                        geometry
#>    <chr>       <dbl> <dbl> <dbl>                              <MULTIPOLYGON [°]>
#>  1 Ashe        0.114  1091  1364 (((-81.47276 36.23436, -81.54084 36.27251, -81…
#>  2 Alleghany   0.061   487   542 (((-81.23989 36.36536, -81.24069 36.37942, -81…
#>  3 Surry       0.143  3188  3616 (((-80.45634 36.24256, -80.47639 36.25473, -80…
#>  4 Currituck   0.07    508   830 (((-76.00897 36.3196, -76.01735 36.33773, -76.…
#>  5 Northampton 0.153  1421  1606 (((-77.21767 36.24098, -77.23461 36.2146, -77.…
#>  6 Hertford    0.097  1452  1838 (((-76.74506 36.23392, -76.98069 36.23024, -76…
#>  7 Camden      0.062   286   350 (((-76.00897 36.3196, -75.95718 36.19377, -75.…
#>  8 Gates       0.091   420   594 (((-76.56251 36.34057, -76.60424 36.31498, -76…
#>  9 Warren      0.118   968  1190 (((-78.30876 36.26004, -78.28293 36.29188, -78…
#> 10 Stokes      0.124  1612  2038 (((-80.02567 36.25023, -80.45301 36.25709, -80…
#> # ℹ 90 more rows

# test and visualize neighbourhood subsetting for a single county (Lee)
lee = tibble::lst(
  polygon = nc[nc$NAME == "Lee","geometry"],
  centroid = st_centroid(polygon),
  buffer = st_buffer(centroid, 50000),
  within = st_filter(nc, buffer)
)

ggplot() +
  geom_sf(data = nc) +
  geom_sf(data = lee$within, fill = "green") +
  geom_sf(data = lee$polygon, fill = "red") +
  geom_sf(data = lee$buffer, alpha = .6, fill = "gold") +
  geom_sf(data = lee$centroid, size = 1) 

在数据集上应用相同的逻辑
# resulting within_dist type is sgbp, "sparse geometry binary predicate lists",
# a list of sf object row indeces that intersect with the buffer  
nc <- nc %>% 
  mutate(within_dist = st_centroid(geometry) %>% 
           st_buffer(50000) %>% 
           st_intersects(geometry)
         , .before = "geometry")
nc
#> Simple feature collection with 100 features and 5 fields
#> Geometry type: MULTIPOLYGON
#> Dimension:     XY
#> Bounding box:  xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
#> Geodetic CRS:  NAD27
#> # A tibble: 100 × 6
#>    NAME         AREA BIR74 BIR79 within_dist                            geometry
#>  * <chr>       <dbl> <dbl> <dbl> <sgbp[,100]>                 <MULTIPOLYGON [°]>
#>  1 Ashe        0.114  1091  1364 <int [7]>    (((-81.47276 36.23436, -81.54084 …
#>  2 Alleghany   0.061   487   542 <int [7]>    (((-81.23989 36.36536, -81.24069 …
#>  3 Surry       0.143  3188  3616 <int [9]>    (((-80.45634 36.24256, -80.47639 …
#>  4 Currituck   0.07    508   830 <int [8]>    (((-76.00897 36.3196, -76.01735 3…
#>  5 Northampton 0.153  1421  1606 <int [9]>    (((-77.21767 36.24098, -77.23461 …
#>  6 Hertford    0.097  1452  1838 <int [10]>   (((-76.74506 36.23392, -76.98069 …
#>  7 Camden      0.062   286   350 <int [11]>   (((-76.00897 36.3196, -75.95718 3…
#>  8 Gates       0.091   420   594 <int [9]>    (((-76.56251 36.34057, -76.60424 …
#>  9 Warren      0.118   968  1190 <int [8]>    (((-78.30876 36.26004, -78.28293 …
#> 10 Stokes      0.124  1612  2038 <int [8]>    (((-80.02567 36.25023, -80.45301 …
#> # ℹ 90 more rows

# within_dist for Lee:
nc$within_dist[nc$NAME == "Lee"]
#> [[1]]
#>  [1] 27 29 30 37 47 48 54 60 63 67 82 86

# resolved to NAMEs :
nc$NAME[nc$within_dist[nc$NAME == "Lee"][[1]]]
#>  [1] "Alamance"   "Orange"     "Durham"     "Wake"       "Randolph"  
#>  [6] "Chatham"    "Johnston"   "Lee"        "Harnett"    "Moore"     
#> [11] "Cumberland" "Hoke"

构建嵌套小标题

# ignore geometries for now for more compact output
nc_df <- st_drop_geometry(nc)

# build a nested tibble, each county row gets a tibble of neighbours (nb),
# use rowwise grouping to subset nc_df with within_dist of current row
nc_nested <- nc_df %>% 
  rowwise() %>%
  mutate(
    nb_idx = paste(within_dist, collapse = ","),
    nb_names = paste(nc_df[["NAME"]][within_dist], collapse = ","),
    nb = (nc_df[within_dist, ]) %>% select(-within_dist) %>% list()) %>% 
  ungroup()
nc_nested
#> # A tibble: 100 × 8
#>    NAME         AREA BIR74 BIR79 within_dist  nb_idx           nb_names nb      
#>    <chr>       <dbl> <dbl> <dbl> <sgbp[,100]> <chr>            <chr>    <list>  
#>  1 Ashe        0.114  1091  1364 <int [7]>    1,2,3,18,19,22,… Ashe,Al… <tibble>
#>  2 Alleghany   0.061   487   542 <int [7]>    1,2,3,18,19,23,… Ashe,Al… <tibble>
#>  3 Surry       0.143  3188  3616 <int [9]>    1,2,3,10,18,23,… Ashe,Al… <tibble>
#>  4 Currituck   0.07    508   830 <int [8]>    4,7,8,17,20,21,… Curritu… <tibble>
#>  5 Northampton 0.153  1421  1606 <int [9]>    5,6,8,9,16,28,3… Northam… <tibble>
#>  6 Hertford    0.097  1452  1838 <int [10]>   5,6,7,8,16,17,2… Northam… <tibble>
#>  7 Camden      0.062   286   350 <int [11]>   4,6,7,8,17,20,2… Curritu… <tibble>
#>  8 Gates       0.091   420   594 <int [9]>    4,5,6,7,8,17,20… Curritu… <tibble>
#>  9 Warren      0.118   968  1190 <int [8]>    5,9,13,15,16,24… Northam… <tibble>
#> 10 Stokes      0.124  1612  2038 <int [8]>    3,10,12,23,25,2… Surry,S… <tibble>
#> # ℹ 90 more rows
使用嵌套小标题
# we can now calculate summary statistics by accessing nested tibbles in 
# nb column with purrr::map*() or lapply();
# or though rowwise grouping:

nc_nested %>% 
  select(NAME, nb_names, nb) %>% 
  rowwise() %>% 
  mutate(nb_bir74_mean = mean(nb$BIR74),
         nb_bir74_sum  =  sum(nb$BIR74),
         nb_area_sum   =  sum(nb$AREA))
#> # A tibble: 100 × 6
#> # Rowwise: 
#>    NAME        nb_names          nb       nb_bir74_mean nb_bir74_sum nb_area_sum
#>    <chr>       <chr>             <list>           <dbl>        <dbl>       <dbl>
#>  1 Ashe        Ashe,Alleghany,S… <tibble>         1946.        13625       0.784
#>  2 Alleghany   Ashe,Alleghany,S… <tibble>         2092.        14643       0.839
#>  3 Surry       Ashe,Alleghany,S… <tibble>         3111.        27997       1.06 
#>  4 Currituck   Currituck,Camden… <tibble>          607          4856       0.576
#>  5 Northampton Northampton,Hert… <tibble>         2047.        18420       1.22 
#>  6 Hertford    Northampton,Hert… <tibble>         1293.        12933       1.05 
#>  7 Camden      Currituck,Hertfo… <tibble>          784.         8622       0.953
#>  8 Gates       Currituck,Northa… <tibble>          920.         8284       0.813
#>  9 Warren      Northampton,Warr… <tibble>         2366.        18925       1.08 
#> 10 Stokes      Surry,Stokes,Roc… <tibble>         5660.        45276       0.998
#> # ℹ 90 more rows


# or we could unnest nb column to lenghten our dataset ... 
nc_unnested <- nc_nested %>% 
  unnest(nb, names_sep = ".") %>% 
  select(-(within_dist:nb_names)) 
print(nc_unnested, n = 14)
#> # A tibble: 1,004 × 8
#>    NAME       AREA BIR74 BIR79 nb.NAME   nb.AREA nb.BIR74 nb.BIR79
#>    <chr>     <dbl> <dbl> <dbl> <chr>       <dbl>    <dbl>    <dbl>
#>  1 Ashe      0.114  1091  1364 Ashe        0.114     1091     1364
#>  2 Ashe      0.114  1091  1364 Alleghany   0.061      487      542
#>  3 Ashe      0.114  1091  1364 Surry       0.143     3188     3616
#>  4 Ashe      0.114  1091  1364 Wilkes      0.199     3146     3725
#>  5 Ashe      0.114  1091  1364 Watauga     0.081     1323     1775
#>  6 Ashe      0.114  1091  1364 Avery       0.064      781      977
#>  7 Ashe      0.114  1091  1364 Caldwell    0.122     3609     4249
#>  8 Alleghany 0.061   487   542 Ashe        0.114     1091     1364
#>  9 Alleghany 0.061   487   542 Alleghany   0.061      487      542
#> 10 Alleghany 0.061   487   542 Surry       0.143     3188     3616
#> 11 Alleghany 0.061   487   542 Wilkes      0.199     3146     3725
#> 12 Alleghany 0.061   487   542 Watauga     0.081     1323     1775
#> 13 Alleghany 0.061   487   542 Yadkin      0.086     1269     1568
#> 14 Alleghany 0.061   487   542 Iredell     0.155     4139     5400
#> # ℹ 990 more rows

# ... and use group_by() / summarise() or just summarise(..., .by):
nc_unnested %>% 
  summarise(nb_bir74_mean = mean(nb.BIR74),
            nb_bir74_sum  =  sum(nb.BIR74),
            nb_area_sum   =  sum(nb.AREA), .by = NAME)
#> # A tibble: 100 × 4
#>    NAME        nb_bir74_mean nb_bir74_sum nb_area_sum
#>    <chr>               <dbl>        <dbl>       <dbl>
#>  1 Ashe                1946.        13625       0.784
#>  2 Alleghany           2092.        14643       0.839
#>  3 Surry               3111.        27997       1.06 
#>  4 Currituck            607          4856       0.576
#>  5 Northampton         2047.        18420       1.22 
#>  6 Hertford            1293.        12933       1.05 
#>  7 Camden               784.         8622       0.953
#>  8 Gates                920.         8284       0.813
#>  9 Warren              2366.        18925       1.08 
#> 10 Stokes              5660.        45276       0.998
#> # ℹ 90 more rows

创建于 2023-08-01,使用 reprex v2.0.2

© www.soinside.com 2019 - 2024. All rights reserved.