我有以下代码,它应该打开 MDPI(医学数据库)的一个网页并提取 20 篇文章的列表,然后访问每篇文章的 URL,提取作者列表中出现的第一个电子邮件地址。目前我可以提取标题,但无法提取作者姓名或电子邮件地址。
这是一个例子:
有人能帮忙调整代码,使其专门提取第一作者的电子邮件地址吗?
import requests
from bs4 import BeautifulSoup
# Prefix Cloudflare's e-mail obfuscation puts in an href in place of "mailto:".
# NOTE(review): MDPI article pages appear to be served with this obfuscation
# enabled -- confirm against the live markup.
_CF_EMAIL_MARKER = "/cdn-cgi/l/email-protection#"


def _decode_cf_email(encoded):
    """Decode a Cloudflare-obfuscated e-mail payload.

    The payload is a hex string whose first byte is an XOR key; every
    following byte is one byte of the address XOR-ed with that key.

    Args:
        encoded: Hex string taken from after the ``#`` of the protected href.

    Returns:
        The decoded address, or ``None`` if the payload is not valid hex or
        contains no data after the key byte.
    """
    try:
        raw = bytes.fromhex(encoded)
    except ValueError:
        return None
    if len(raw) < 2:
        return None
    key = raw[0]
    return bytes(byte ^ key for byte in raw[1:]).decode("utf-8", errors="replace")


def extract_email(article_url):
    """Return the first author e-mail address found on an article page.

    The page is fetched with ``requests`` and the first
    ``a.toEncode.emailCaptcha`` link inside the
    ``.art-authors.hypothesis_container`` element is inspected. Both plain
    ``mailto:`` links and Cloudflare-obfuscated links are understood.

    Args:
        article_url: Absolute URL of the article page.

    Returns:
        The e-mail address on success; otherwise a human-readable string:
        ``"Email not found"``, ``"Failed to retrieve article page"`` (non-200
        response), or the text of the network error raised by ``requests``.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        article_response = requests.get(article_url, timeout=30)
    except requests.RequestException as e:
        return str(e)

    if article_response.status_code != 200:
        return "Failed to retrieve article page"

    article_soup = BeautifulSoup(article_response.text, 'html.parser')
    # Find the email within the art-authors hypothesis_container
    email_tag = article_soup.select_one(
        '.art-authors.hypothesis_container a.toEncode.emailCaptcha'
    )
    href = email_tag.get('href', '') if email_tag is not None else ''

    if href.startswith("mailto:"):
        return href[len("mailto:"):]

    if _CF_EMAIL_MARKER in href:
        decoded = _decode_cf_email(href.split(_CF_EMAIL_MARKER, 1)[1])
        if decoded:
            return decoded

    return "Email not found"
# URL of MDPI's journal homepage
journal_url = 'https://www.mdpi.com/journal/biology'


def _clean(text):
    """Collapse all whitespace runs to single spaces.

    Scraped text may contain tabs or newlines, which would otherwise break
    the tab-separated output rows.
    """
    return " ".join(text.split())


def _is_author_tag(tag):
    """Return True if any class token or the id of *tag* contains "author"."""
    # BeautifulSoup exposes ``class`` as a list of tokens, so a plain
    # ``"author" in classes`` only matches the exact token "author" and
    # misses e.g. "authors"; test each token as a substring instead.
    classes = tag.get('class') or []
    if isinstance(classes, str):
        classes = [classes]
    markers = list(classes) + [tag.get('id') or '']
    return any("author" in marker.lower() for marker in markers)


# Send a GET request to the journal homepage (bounded so it cannot hang)
response = requests.get(journal_url, timeout=30)

# Check if request was successful
if response.status_code == 200:
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the elements containing article information, limited to 20
    articles = soup.find_all('div', class_='article-content')[:20]

    # Open a file in write mode to save the output
    with open('mdpi_articles.txt', 'w', encoding='utf-8') as file:
        # Write the header
        file.write("Title\tAuthors\tPublication Date\tEmail of First Author\n")

        # Extract and write information for each article
        for article in articles:
            try:
                # Extracting title and publication date
                title_element = article.find('a', class_='title-link')
                title = (
                    _clean(title_element.text)
                    if title_element is not None
                    else "Title not found"
                )

                pub_date_element = article.find('div', class_='item-pubdate')
                pub_date = (
                    _clean(pub_date_element.text)
                    if pub_date_element is not None
                    else "Publication date not found"
                )

                # Authors: first a/span/div whose class or id mentions
                # "author". Unlike before, a bare <a>/<span> no longer
                # matches unconditionally (that picked up the first link
                # in the article instead of the author list).
                author_tag = next(
                    (
                        tag
                        for tag in article.find_all(["a", "span", "div"])
                        if _is_author_tag(tag)
                    ),
                    None,
                )
                authors = "Authors not found"
                if author_tag is not None:
                    author_text = _clean(
                        author_tag.get_text(separator=', ', strip=True)
                    )
                    if author_text:
                        authors = author_text

                # Extract the URL of the article page
                article_url = (
                    title_element.get('href')
                    if title_element is not None
                    else None
                )

                # If URL is found, extract the email of the first author
                if article_url:
                    # Site-relative links need the hostname prepended
                    if article_url.startswith("/"):
                        article_url = "https://www.mdpi.com" + article_url
                    email = _clean(extract_email(article_url))
                else:
                    email = "Article URL not found"

                # Write the extracted information to the file
                file.write(f"{title}\t{authors}\t{pub_date}\t{email}\n")
            except Exception as e:
                # Per-article boundary: report the problem and keep going so
                # one malformed entry does not abort the whole export.
                print(f"Error: {e}. Skipping article.")

    print("Data saved to mdpi_articles.txt")
else:
    print('Failed to retrieve data:', response.status_code)
我知道如何使用 Python 的 requests 库获取电子邮件地址。