尝试使用 urllib 从网站下载文件时出现错误 403

问题描述 投票:0回答:1

我尝试从该链接下载这 2 个文件,但服务器一直返回 403 Forbidden 错误,而该网站并未说明访问这些文件需要任何身份验证。


# Remote files to fetch, written relative to ``data_hostname``.
files = [
    "sm/sm.data.1.AllData",
    "ce/ce.data.0.AllCESSeries",
]

# Absolute path of the folder that contains this script.
dir = os.path.abspath(os.path.dirname(__file__))
# "data" sub-folder of that location (joined with a Windows-style backslash).
datadir = f"{dir}\\data"
# Switch the working directory to the data folder; os.chdir raises if it
# does not exist.
os.chdir(datadir)

# Root URL that every entry of ``files`` is appended to.
data_hostname = "http://download.bls.gov/pub/time.series/"
# Folder the downloaded files are written into.
current_filesystem = datadir

def download_data():
    """Download every entry of the module-level ``files`` list.

    Each entry is appended to ``data_hostname`` to form the URL and is saved
    under ``current_filesystem``. Progress messages are printed for every
    file and once more when all downloads have completed.

    Returns:
        None.
    """
    for remote_name in files:
        # Local file name: drop the first three characters of the entry
        # (e.g. a "sm/" directory prefix) and append ".txt".
        local_name = remote_name[3:] + ".txt"
        source_url = data_hostname + remote_name
        destination = current_filesystem + "/" + local_name
        print(f"downloading from: {source_url}")
        urllib.request.urlretrieve(source_url, destination)
        print(f"download path: {destination}")
        # Clear temporary files that urlretrieve may have left behind.
        urllib.request.urlcleanup()
    print("Finished Downloading Data")
python urllib http-status-code-403
1个回答
0
投票

您的代码缺少几个必需的 HTTP 标头,具体来说是 User-Agent 和 Accept-Language。

使用 requests (如果您还没有,则需要安装它),您可以执行以下操作:

import requests
import os
from pathlib import Path

# Browser-like headers attached to every download request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,pt;q=0.7",
}

# Size in bytes of each streamed chunk (128 KiB).
CHUNK = 128 * 1024

# Root URL that every entry of ``files`` is appended to.
data_hostname = "http://download.bls.gov/pub/time.series/"

# Remote files to fetch, written relative to ``data_hostname``.
files = [
    "sm/sm.data.1.AllData",
    "ce/ce.data.0.AllCESSeries",
]

# Output folder: a "data" directory next to this script, created if missing.
_script_dir = os.path.abspath(os.path.dirname(__file__))
target_dir = Path(_script_dir) / "data"
target_dir.mkdir(exist_ok=True)

# Seconds to wait for the server to connect or to send more data. Without a
# timeout, requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

for file in files:
    # Stream the response so the file is written chunk by chunk instead of
    # being loaded into memory in one piece.
    with requests.get(
        data_hostname + file,
        headers=HEADERS,
        stream=True,
        timeout=REQUEST_TIMEOUT,
    ) as response:
        # Raise requests.HTTPError on any 4xx/5xx status (such as 403).
        response.raise_for_status()
        # Local name: last path segment of the entry plus a ".txt" suffix.
        target_file = file.split("/")[-1] + ".txt"
        with open(target_dir / target_file, "wb") as output:
            for chunk in response.iter_content(CHUNK):
                output.write(chunk)
© www.soinside.com 2019 - 2024. All rights reserved.