How do I get a world map for geopandas after the datasets module is removed?

Problem description · Votes: 0 · Answers: 2

I find that loading the world map from the geopandas datasets is very simple and useful, as with probably many other datasets, e.g.:

import geopandas as gpd

world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

However, this raises a FutureWarning that the datasets module is deprecated and will be removed in a future release. The maps are available for download, e.g. from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/, but the files are zipped, and fetching and processing the files from there does not seem like a convenient workflow, nor does it keep the processed files together with their source.

Are there alternatives? What is the best approach, especially if I want my code to keep working with future versions of geopandas?

python visualization geospatial geopandas
2 Answers
1 vote

The simplest solution is to download/store the shapefile somewhere.
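For example, fetch the zip once with the standard library and read the local copy on later runs; a minimal sketch (the local filename is an arbitrary choice, the mirror URL is the Natural Earth CDN also used in the second answer, and geopandas can read zipped shapefiles directly):

import geopandas as gpd
from pathlib import Path
from urllib.request import urlretrieve

url = ("https://naciscdn.org/naturalearth/110m/cultural/"
       "ne_110m_admin_0_countries.zip")
local = Path("ne_110m_admin_0_countries.zip")
if not local.exists():
    urlretrieve(url, local)  # download once, reuse afterwards
world = gpd.read_file(local)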

That being said, if (for some reason) you need to read it from the source, you can do it this way:

import fsspec
import geopandas as gpd

url = "https://www.naturalearthdata.com/http//www.naturalearthdata.com/" \
      "download/110m/cultural/ne_110m_admin_0_countries.zip"
with fsspec.open(f"simplecache::{url}") as file:
    gdf = gpd.read_file(file)

Output:

          featurecla  scalerank  ...     FCLASS_UA                            geometry
0    Admin-0 country          1  ...          None  MULTIPOLYGON (((180.00000 -16.0...
1    Admin-0 country          1  ...          None  POLYGON ((33.90371 -0.95000, 34...
2    Admin-0 country          1  ...          None  POLYGON ((-8.66559 27.65643, -8...
..               ...        ...  ...           ...                                 ...
174  Admin-0 country          1  ...  Unrecognized  POLYGON ((20.59025 41.85541, 20...
175  Admin-0 country          1  ...          None  POLYGON ((-61.68000 10.76000, -...
176  Admin-0 country          1  ...          None  POLYGON ((30.83385 3.50917, 29....

[177 rows x 169 columns]
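If the download should survive across sessions, fsspec's simplecache can also be pointed at a persistent directory instead of a temporary one; a sketch (the cache_storage location is an arbitrary choice):

with fsspec.open(f"simplecache::{url}",
                 simplecache={"cache_storage": "ne_cache"}) as file:
    gdf = gpd.read_file(file)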


0 votes

For my jupyter notebooks I have a tools.py that stores frequently reused methods. One of them is get_shapes(), which fetches a few frequently needed base shapes (world: countries, us: states, de: states).

Relative paths in the working folder:

  • tools/tools.py
  • tools/__init__.py (empty)

Used as:

import sys
from pathlib import Path

module_path = str(Path.cwd() / "tools")
if module_path not in sys.path:
    sys.path.append(module_path)
from tools import tools

CRS_PROJ = "esri:54009" # Mollweide (e.g.)

world = tools.get_shapes(
    "world", shape_dir=Path.cwd() / "shapes")
world.to_crs(CRS_PROJ, inplace=True)
world.plot()
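The same call then works for the other references; a small sketch (assuming the default set_index=True, so the frame is indexed by state name):

us = tools.get_shapes(
    "us", shape_dir=Path.cwd() / "shapes")
us.plot()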

This can (certainly) be improved, but it is a start. Extend it if needed:

import io
import csv
import numpy as np
import pandas as pd
import requests
import geopandas as gp
import zipfile
from pathlib import Path
from IPython.display import clear_output
from typing import List, Optional, Dict, Tuple
from IPython.core.display import display

def stream_progress(total_length: int, loaded: int):
    """Stream progress report"""
    clear_output(wait=True)            
    perc_str = ""
    if total_length:
        total = total_length/1000000
        perc = loaded/(total/100)
        perc_str = f"of {total:.2f} ({perc:.0f}%)"
    print(
        f"Loaded {loaded:.2f} MB "
        f"{perc_str}..")

def stream_progress_basic(total: int, loaded: int):
    """Stream progress report"""
    clear_output(wait=True)            
    perc_str = ""
    if total:
        perc = loaded/(total/100)
        perc_str = f"of {total:.0f} ({perc:.0f}%)"
    print(
        f"Processed {loaded:.0f} "
        f"{perc_str}..")

def return_total(headers) -> Optional[int]:
    """Return total content length (bytes) from response headers, if reported"""
    total_length = headers.get('content-length')
    return int(total_length) if total_length is not None else None

def get_stream_file(url: str, path: Path):
    """Download file from url and save to path"""
    chunk_size = 8192
    loaded = 0
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total_length = return_total(r.headers)
        with open(path, 'wb') as f:
            for ix, chunk in enumerate(r.iter_content(chunk_size=chunk_size)): 
                f.write(chunk)
                loaded = (ix*chunk_size)/1000000
                if (ix % 100 == 0):
                    stream_progress(
                        total_length, loaded)
            stream_progress(
                total_length, loaded)
                        
def get_stream_bytes(url: str):
    """Stream file from url to bytes object (in-memory)"""
    chunk_size = 8192
    content = bytes()
    loaded = 0
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        total_length = return_total(r.headers)
        for ix, chunk in enumerate(r.iter_content(
                chunk_size=chunk_size)): 
            content += bytes(chunk)
            loaded = (ix*chunk_size)/1000000
            if (ix % 100 == 0):
                stream_progress(
                    total_length, loaded)
    stream_progress(
        total_length, loaded)
    return content
    
def get_folder_size(folder: Path):
    """Return size of all files in folder in MegaBytes"""
    if not folder.exists():
        raise Warning(
            f"Folder {folder} does not exist")
        return
    size_mb = 0
    for file in folder.glob('*'):
        size_mb += file.stat().st_size / (1024*1024)
    return size_mb
    
def get_zip_extract(
    uri: str, filename: str, output_path: Path, 
    create_path: bool = True, skip_exists: bool = True,
    report: bool = True, filter_files: Optional[List[str]] = None,
    write_intermediate: bool = None):
    """Get Zip file and extract to output_path.
    Create Path if not exists."""
    if write_intermediate is None:
        write_intermediate = False
    if create_path:
        output_path.mkdir(
            exist_ok=True)
    if skip_exists and Path(
        output_path / filename.replace(".zip", ".csv")).exists():
        if report:
            print("File already exists.. skipping download..")
        return
    if write_intermediate:
        out_file = output_path / filename
        get_stream_file(f'{uri}{filename}', out_file)
        z = zipfile.ZipFile(out_file)
    else:
        content = get_stream_bytes(
            f'{uri}{filename}')
        z = zipfile.ZipFile(io.BytesIO(content))
    print("Extracting zip..")
    if filter_files:
        # extract only selected members; avoid shadowing the filename parameter
        for member in z.namelist():
            if member in filter_files:
                z.extract(member, output_path)
    else:
        z.extractall(output_path)
    if write_intermediate:
        if out_file.is_file():
            out_file.unlink()
    if report:
        raw_size_mb = get_folder_size(output_path)
        print(
            f"Retrieved {filename}, "
            f"extracted size: {raw_size_mb:.2f} MB")

def drop_cols_except(df: pd.DataFrame, columns_keep: List[str]):
    """Drop all columns from DataFrame except those specified in columns_keep"""
    df.drop(
        df.columns.difference(columns_keep), axis=1, inplace=True)

def get_shapes(
        reference: str, shape_dir: Path,
        clean_cols: Optional[bool] = None, normalize_cols: Optional[bool] = None,
        set_index: Optional[bool] = None) -> gp.GeoDataFrame:
    """Custom method to get frequently used shapes (DE Bundesländer, US States)
    and return a geopandas.GeoDataFrame (WGS1984)

    reference: str - "us" and "de" are currently supported
    clean_cols: will remove all columns except geometry and state-reference. Defaults to True.
    normalize_cols: will rename columns to sane defaults. Defaults to True.
    set_index: will set state-reference as index column. Defaults to True.
    """
    if clean_cols is None:
        clean_cols = True
    if normalize_cols is None:
        normalize_cols = True
    if set_index is None:
        set_index = True
    target_name = "state"
    if reference == "us":
        source_zip = "https://www2.census.gov/geo/tiger/GENZ2018/shp/"
        filename = "cb_2018_us_state_5m.zip"
        shapes_name = "cb_2018_us_state_5m.shp"
        col_name = "NAME"
    elif reference == "de":
        source_zip = "https://daten.gdz.bkg.bund.de/produkte/vg/vg2500/aktuell/"
        filename = "vg2500_12-31.utm32s.shape.zip"
        shapes_name = "vg2500_12-31.utm32s.shape/vg2500/VG2500_LAN.shp"
        col_name = "GEN"
    elif reference == "world":
        source_zip = "https://naciscdn.org/naturalearth/110m/cultural/"
        filename = "ne_110m_admin_0_countries.zip"
        shapes_name = "ne_110m_admin_0_countries.shp"
        col_name = "SOVEREIGNT"
        target_name = "country"
    else:
        raise ValueError(f"Unknown reference: {reference}")
    # create temporary storage folder, if it does not exist already
    shape_dir.mkdir(exist_ok=True)
    # test if file already downloaded
    if not (shape_dir / shapes_name).exists():
        get_zip_extract(
            uri=source_zip, filename=filename, output_path=shape_dir)
    else:
        print("Already exists")
    shapes = gp.read_file(shape_dir / shapes_name)
    if clean_cols:
        drop_cols_except(df=shapes, columns_keep=["geometry", col_name])
    if normalize_cols:
        shapes.rename(columns={col_name: target_name}, inplace=True)
        col_name = target_name
    if set_index:
        shapes.set_index(col_name, inplace=True)
    return shapes.to_crs("EPSG:4326")
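get_zip_extract() can also be used on its own, e.g. to pull only selected members out of an archive via filter_files; a sketch reusing the Natural Earth source from get_shapes() (this particular zip stores its members at the archive root):

get_zip_extract(
    uri="https://naciscdn.org/naturalearth/110m/cultural/",
    filename="ne_110m_admin_0_countries.zip",
    output_path=Path.cwd() / "shapes",
    filter_files=[
        "ne_110m_admin_0_countries.shp",
        "ne_110m_admin_0_countries.shx",
        "ne_110m_admin_0_countries.dbf",
        "ne_110m_admin_0_countries.prj"])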