我正在使用
aiohttp
根据图像 URL 从不同站点异步下载图像。之前,我使用 requests.get
来同步执行相同的操作。我能够成功使用 requests.get
下载图像,但当我尝试使用 aiohttp 下载图像时,相同的 URL 会抛出 403 Forbidden
错误。我尝试找出可能的原因,但到目前为止还没有任何进展。requests.get
不需要任何额外的请求头(headers)就能获取图像。这里的 URL 很关键,因为只有该网站的 URL 会出现此 403 错误。
requests.get
版本:
import requests
from io import BytesIO
async def download_image(self, url: str):
    """Download the image at ``url`` with ``requests`` and return it in memory.

    ``self`` is not used by the body.

    Args:
        url (str): image url

    Returns:
        ``None`` when ``is_url`` rejects ``url`` or when the response's media
        type is not one of the accepted image types; otherwise a
        ``(BytesIO, file_name, mimetype)`` tuple, where ``mimetype`` is the
        lower-cased media type without any ``;``-separated parameters.
    """
    # is_url is just a small function which returns True if url is valid
    if not is_url(url):
        return None

    VALID_MIME_TYPES = {
        "image/jpeg": ".jpeg",
        "image/png": ".png",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
        "image/apng": ".apng",
        "image/svg+xml": ".svg",
        # get_file_extension_from_url used to get image's type from the URL
        "application/octet-stream": get_file_extension_from_url(url=url),
    }

    # NOTE(review): requests.get is a synchronous call, so this coroutine
    # does not yield to the event loop while the download is in progress.
    response = requests.get(url)

    # A Content-Type header may carry parameters ("image/jpeg; charset=...");
    # only the media type itself is compared against the table.
    raw_type = response.headers.get("Content-Type", "")
    mimetype = raw_type.split(";", 1)[0].strip().lower()
    if mimetype not in VALID_MIME_TYPES:
        return None

    # The table's extensions already start with ".", so drop it before
    # formatting to avoid names such as "cimage..jpeg".
    extension = str(VALID_MIME_TYPES[mimetype]).lstrip(".")
    file_name = f"cimage.{extension}"

    # converting to BytesIO stream
    return BytesIO(response.content), file_name, mimetype
# Sample image URL; note that its path and query string contain percent-encoded characters.
image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
# NOTE(review): download_image above is declared as `async def download_image(self, url)`,
# so this call binds image_url to `self`, leaves `url` unfilled, and is not awaited.
# Presumably the function was copied out of a class - confirm how it is really invoked.
image_output = download_image(image_url)
aiohttp
版本:
import aiohttp, asyncio
from io import BytesIO
async def download_image(self, url: str, session: aiohttp.ClientSession):
    """Download the image at ``url`` through a shared aiohttp session.

    ``self`` is not used by the body.

    Args:
        url (str): image url
        session (aiohttp.ClientSession): Using a common aiohttp session for speedup

    Returns:
        ``None`` when ``is_url`` rejects ``url`` or when the response's media
        type is not one of the accepted image types; otherwise a
        ``(BytesIO, file_name, mimetype)`` tuple, where ``mimetype`` is the
        lower-cased media type without any ``;``-separated parameters.
    """
    if not is_url(url):
        return None

    VALID_MIME_TYPES = {
        "image/jpeg": ".jpeg",
        "image/png": ".png",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
        "image/apng": ".apng",
        "image/svg+xml": ".svg",
        "application/octet-stream": get_file_extension_from_url(url=url),
    }

    # Using the request as an async context manager releases the response
    # (and its connection) on every exit path, including the early return.
    async with session.request(method="GET", url=url) as res:
        # A Content-Type header may carry parameters ("image/jpeg; charset=...");
        # only the media type itself is compared against the table.
        raw_type = res.headers.get("Content-Type", "")
        mimetype = raw_type.split(";", 1)[0].strip().lower()
        if mimetype not in VALID_MIME_TYPES:
            return None

        # The table's extensions already start with ".", so drop it before
        # formatting to avoid names such as "cimage..jpeg".
        extension = str(VALID_MIME_TYPES[mimetype]).lstrip(".")
        file_name = f"cimage.{extension}"
        content = await res.read()

    return BytesIO(content), file_name, mimetype
if __name__ == "__main__":
    image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
    image_urls = [image_url] * 1  # I just increase the number for testing

    async def main():
        """Schedule one download task per URL on a shared session and await them all."""
        async with aiohttp.ClientSession(trust_env=True) as session:
            pending = [
                asyncio.create_task(download_image(url=link, session=session))
                for link in image_urls
            ]
            # gather returns one result per task, in the same order as the URLs
            # (each a 3-item tuple, or None, as returned by download_image).
            results = await asyncio.gather(*pending)

    asyncio.run(main())
这是 aiohttp 请求完成后,打印
res
得到的输出:
<ClientResponse(https://img.evbuc.com/https:%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format,compress&q=75&sharp=10&rect=0,15,1200,600&s=13645e838fd09f2552c8f8500410abec) [403 Forbidden]>
<CIMultiDictProxy('Content-Type': 'text/plain', 'Content-Length': '14', 'Connection': 'keep-alive', 'Cache-Control': 'public, max-age=5', 'Server': 'imgix', 'x-imgix-id': 'e3fb1d9c2f4cdf79dc45ca6fa20455560bdc05a5', 'x-imgix-proxy-status': '403', 'x-imgix-proxy-reason': '', 'X-Imgix-Render-Farm': '01.140360', 'Date': 'Wed, 18 Oct 2023 20:11:20 GMT', 'Accept-Ranges': 'bytes', 'Access-Control-Allow-Origin': '*', 'Timing-Allow-Origin': '*', 'Cross-Origin-Resource-Policy': 'cross-origin', 'X-Content-Type-Options': 'nosniff', 'X-Served-By': 'cache-sjc10076-SJC, cache-bom4734-BOM', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 9e8c29342ff6f7610166562f3559cbe4.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'BOM78-P1', 'X-Amz-Cf-Id': 'cFMR0YKkz5pLrgzH-IkmYd0JTYqgZPT-wDKbTdxDiOr_ZJH_v3xLeg==', 'Age': '0')>
我的代码哪里出了问题?
希望能得到解决方案。谢谢。
需要说明的是,这段 aiohttp 代码对其他 URL 可以正常工作,只有这种类型的 URL 会出现这个奇怪的问题。
aiohttp
会对 URL 进行规范化(requests
不会这样做,因此它的请求能够成功);从上面的输出可以看到,URL 中的 %3A 和 %2C 已被还原为 : 和 ,。您可以使用 yarl.URL
并传入 encoded=True
来禁用此行为:
import asyncio
import aiohttp
import yarl
async def download_image(url: str, session: aiohttp.ClientSession):
    """Request ``url`` exactly as written and print the resulting response.

    Args:
        url (str): image url
        session (aiohttp.ClientSession): Using a common aiohttp session for speedup
    """
    # encoded=True marks the string as already percent-encoded, so the URL is
    # sent as given instead of being re-quoted.
    url = yarl.URL(url, encoded=True)

    # Using the request as an async context manager releases the response
    # (and its connection) once the prints are done.
    async with session.request(method="GET", url=url) as res:
        print(res)
        print()
if __name__ == "__main__":
    image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
    image_urls = [image_url] * 1  # I just increase the number for testing

    async def main():
        """Schedule one download task per URL on a shared session and await them all."""
        async with aiohttp.ClientSession(trust_env=True) as session:
            jobs = [
                asyncio.create_task(download_image(url=target, session=session))
                for target in image_urls
            ]
            # One entry per task, in the same order as image_urls.
            outcomes = await asyncio.gather(*jobs)

    asyncio.run(main())
打印:
<ClientResponse(https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec) [200 OK]>
<CIMultiDictProxy('Content-Type': 'image/jpeg', 'Content-Length': '94266', 'Connection': 'keep-alive', 'Last-Modified': 'Wed, 20 Sep 2023 13:07:41 GMT', 'Cache-Control': 'public, max-age=315360001', 'Server': 'imgix', 'x-imgix-id': 'ec36b5116879ca860a0352578065a6c96481160e', 'X-Imgix-Render-Farm': '01.140360', 'Date': 'Wed, 18 Oct 2023 21:05:54 GMT', 'Accept-Ranges': 'bytes', 'Access-Control-Allow-Origin': '*', 'Timing-Allow-Origin': '*', 'Cross-Origin-Resource-Policy': 'cross-origin', 'X-Content-Type-Options': 'nosniff', 'X-Served-By': 'cache-sjc10040-SJC, cache-fra-eddf8230087-FRA', 'X-Cache': 'Miss from cloudfront', 'Via': '1.1 41b7bdf4fb536a6c72b9f49d9b6affe8.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'PRG50-C1', 'X-Amz-Cf-Id': 'iYy-EXyB519KYFu_luKo9bAMnMvANxcrHEj6-Sps0LYWJ5cx60Rbvg==', 'Age': '2447892')>