测试我的 Lambda 函数时。我总是得到这个错误,这显然对调试没有帮助:
OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k
START RequestId: a22379c9-427c-45ab-bfd5-c166bf507418 Version: $LATEST
2023-04-29T15:45:42.496Z a22379c9-427c-45ab-bfd5-c166bf507418 Task timed out after 600.11 seconds
END RequestId: a22379c9-427c-45ab-bfd5-c166bf507418
REPORT RequestId: a22379c9-427c-45ab-bfd5-c166bf507418 Duration: 600108.83 ms Billed Duration: 601489 ms Memory Size: 256 MB Max Memory Used: 154 MB Init Duration: 1379.93 ms
该函数基于 ECR 上的图像执行 docker 容器。
我的 lambda 函数处理程序如下所示:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import math
from time import sleep
import pandas as pd
from datetime import date
from requests.adapters import HTTPAdapter
import boto3
from io import BytesIO
COLUMNS = [<<list of columns for the pandas df>> ]
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
"Accept-Encoding": "gzip, deflate, br",
"Referer": "https://www.bbc.com/news/entertainment-arts-64759120",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Connection": "keep-alive",
"Accept-Language": "en-US,en;q=0.9,lt;q=0.8,et;q=0.7,de;q=0.6"}
KEYWORDS = [ <<list of keywords>>]
def make_get_request(max_retries, URL):
session = requests.Session()
retry = HTTPAdapter(max_retries=max_retries)
session.mount('https://', retry)
session.mount('http://', retry)
try:
response = session.get(URL, headers=HEADERS)
return response
except (ConnectionError, requests.exceptions.Timeout) as err:
print(f"Failed to connect to the API, retrying... Error: {err}")
make_get_request(max_retries)
except requests.exceptions.TooManyRedirects as err:
print("Bad URL, try a different one")
raise SystemExit(err)
except requests.exceptions.HTTPError as err:
raise SystemExit(err)
def get_pages()->int:
BASE_URL="https://www.stepstone.de/jobs/data-engineer?action=facet_selected%3bage%3bage_1&ag=age_1"
response = make_get_request(3, BASE_URL)
soup = BeautifulSoup(response.text, 'html.parser')
result_tag = soup.find('span', 'res-kyg8or at-facet-header-total-results')
results = result_tag.text
results = results.replace(".", "")
pages = math.ceil(int(results)/25)
return pages
def extract_job_cards(title, days, i):
URL = f"https://www.stepstone.de/jobs/{title}?page_selected={i}&action=facet_selected%3bage%3bage_{days}&ag=age_{days}"
response = make_get_request(3, URL)
soup = BeautifulSoup(response.text, 'html.parser')
cards = soup.find_all('article', 'res-iibro8')
return cards
def request_job_description(href):
DESCRIPTION_LINK = "https://www.stepstone.de" + href
response_details = make_get_request(3, DESCRIPTION_LINK)
return response_details
def extract_keywords(card):
atag = card.h2.a
title_card = atag.div
title_card_str = title_card.text
title = title_card_str.split("</div></div></div>")[0]
print(title)
company_name = card.div.span.text
href = atag["href"]
occured_keywords = []
response_soup = BeautifulSoup(request_job_description(href).text, 'html.parser')
boxes = response_soup.find_all('div', 'listing-content-provider-10ltcrf')
if len(boxes)==4:
del boxes[0]
del boxes[-1]
for box in boxes:
text = box.find('span', 'listing-content-provider-pz58b2').get_text()
for keyword in KEYWORDS:
if keyword.upper() in text.upper():
occured_keywords.append(keyword.upper())
occured_keywords = list(dict.fromkeys(occured_keywords))
return occured_keywords, title, href, company_name
def append_to_df(occured_keywords, job_df, title, company_name, href):
job_dict = {
"LOADED_AT": date.today(),
"JOB_TITLE": title,
"COMPANY": company_name,
"HREF_LINK": href,
'PYTHON': 0
#There are obviously more key value pairs bit I left them out of #this post for simplicity reasons
}
for skill in occured_keywords:
job_dict[skill] = 1
row = []
for value in job_dict.values():
row.append(value)
job_df.loc[len(job_df)] = row
def extract_and_append_skills(cards, job_df):
for card in cards:
if all(job not in card.h2.a.div.text for job in ["Data Engineer", "DWH", "Data Warehouse", "ETL", "Analytics Engineer", "Business Intelligence", "Data Platform", "Data Architekt", "Data Architect"]):
continue
else:
keywords, title, href, company_name = extract_keywords(card)
append_to_df(keywords, job_df, title, company_name, href)
def main(event, context):
job_df = pd.DataFrame(columns=COLUMNS)
try:
for i in range(get_pages()):
cards = extract_job_cards('data-engineer', 1, i)
extract_and_append_skills(cards, job_df)
job_df = job_df[~job_df.duplicated(subset=['HREF_LINK'])].copy()
print(len(job_df))
return "Success"
except Exception as e:
print(e)
像这样的 Dockerfile:
FROM public.ecr.aws/lambda/python:3.9
COPY stepstone_scraper.py ${LAMBDA_TASK_ROOT}
COPY requirements.txt ./
RUN pip install -r requirements.txt -t "${LAMBDA_TASK_ROOT}"
RUN chmod 644 $(find . -type f)
RUN chmod 755 $(find . -type d)
CMD ["stepstone_scraper.main"]
像这样在 terraform 中创建 lambda 函数:
resource "aws_lambda_function" "job_scraping_function" {
package_type = "Image"
image_uri = "${aws_ecr_repository.scraping_repo.repository_url}:latest"
function_name = "job_scraping_function"
role = aws_iam_role.lambda_s3_role.arn
memory_size = 256
timeout = 600
depends_on = [null_resource.docker_build_and_push]
}
底层角色可以由 lambda 承担,具有完整的 s3、ec2、lambda、ECR 和 cloudwatch 访问权限,AWSLambdaBasicExecutionRole arn 附加到它。
有没有人知道我的问题可能是什么?
你的
def main(event, context):
缩进不正确,所以这可能就是 lambda 不能使用 main 函数的原因。
你几乎没有记录任何东西,所以我希望你看到的是确切的日志。您需要添加更多日志以查看超时前发生的情况。这不是“获取正确日志”的问题,而是您的 Lambda 函数超时的问题。
超时的原因很可能是因为您发出的网络请求在 Lambda 环境中不起作用。您很可能已将 Lambda 函数配置为在 VPC 中运行,而没有将其部署到具有 NAT 网关路由的子网。如果不是这种情况,那么您尝试在 Internet 上访问的 URL 可能已阻止 AWS IP。