我是 Python 新手,正在尝试编写一个脚本,从 Outlook 的 .msg 文件中提取电子邮件地址。但是,某些 .msg 文件可能还带有另一个 .msg 文件作为附件。
def extract_emails_from_folder(folder_path, excluded_email):
    """Print a table of the e-mail addresses found in a folder's MSG files.

    Only the direct children of ``folder_path`` are examined; sub-folders
    are not searched.

    Args:
        folder_path: Directory that contains the ``.msg`` files.
        excluded_email: Address passed through to ``extract_emails_from_msg``
            so that it can be left out of the result.

    Returns:
        None. The collected addresses are printed as a one-column pandas
        DataFrame titled ``Email Addresses``.
    """
    # A set removes duplicates that occur across several messages.
    email_addresses = set()

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Compare the extension case-insensitively so "MAIL.MSG" is not
        # missed, and skip directories that merely have a ".msg" name.
        if not filename.lower().endswith('.msg'):
            continue
        if not os.path.isfile(file_path):
            continue

        email_addresses.update(extract_emails_from_msg(file_path, excluded_email))

    # Present the result as a DataFrame.
    df = pd.DataFrame({'Email Addresses': list(email_addresses)})
    print(df)
有些电子邮件地址位于作为附件嵌入的另一个 .msg 文件中。请问该如何把这些地址也一并提取出来?
您可以使用 `extract-msg` 库:
import os
import re
import tempfile

import extract_msg
def extract_emails_from_msg(file_path, excluded_email):
    """Collect the e-mail addresses found in an Outlook ``.msg`` file.

    Addresses are searched for in the sender, To, Cc and Bcc fields and in
    the message body. Every attachment whose file name ends in ``.msg`` is
    written to a temporary directory and processed recursively in the same
    way.

    Args:
        file_path: Path of the ``.msg`` file to read.
        excluded_email: Address to remove from the result.

    Returns:
        A set of address strings that does not contain ``excluded_email``.
    """
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    email_addresses = set()

    with extract_msg.Message(file_path) as msg:
        # extract-msg documents these fields as optional display strings
        # (e.g. "Name <a@b.c>; ..."). Feeding a string to set.update() would
        # add its individual characters and None would raise, so every field
        # goes through the same regex as the body and empty ones are skipped.
        for field in (msg.sender, msg.to, msg.cc, msg.bcc, msg.body):
            if field:
                email_addresses.update(email_pattern.findall(field))

        for att in msg.attachments:
            att_name = att.longFilename
            # Attachments may have no long file name at all.
            if not att_name or not att_name.lower().endswith('.msg'):
                continue

            # A private temporary directory is portable (no hard-coded
            # "/tmp"), avoids name clashes between attachments, and is
            # removed even when the nested parse raises.
            with tempfile.TemporaryDirectory() as tmp_dir:
                # basename() keeps a crafted attachment name from escaping
                # the temporary directory.
                att_filename = os.path.join(tmp_dir, os.path.basename(att_name))
                # NOTE(review): saveCustom is kept from the original code;
                # confirm it exists in the installed extract-msg version.
                att.saveCustom(att_filename)
                email_addresses.update(
                    extract_emails_from_msg(att_filename, excluded_email)
                )

    email_addresses.discard(excluded_email)
    return email_addresses
def extract_emails_from_folder(folder_path, excluded_email):
    """Gather the e-mail addresses of every MSG file directly in a folder.

    Args:
        folder_path: Directory that contains the ``.msg`` files. Sub-folders
            are not searched.
        excluded_email: Address passed through to ``extract_emails_from_msg``
            so that it can be left out of the result.

    Returns:
        A set with the addresses collected from all matching files; empty
        when the folder holds no MSG file.
    """
    email_addresses = set()
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # Case-insensitive match so "MAIL.MSG" is picked up as well; the
        # isfile() test skips directories that merely carry a ".msg" name.
        is_msg_file = filename.lower().endswith('.msg') and os.path.isfile(file_path)
        if is_msg_file:
            email_addresses.update(extract_emails_from_msg(file_path, excluded_email))
    return email_addresses
# Example usage: point folder_path at the directory that holds the .msg files.
folder_path = '/path/to/your/folder'
# Address to leave out of the result. The previous placeholder
# ("[email protected]") was not a well-formed address and could never match an
# extracted one; replace this value with the real address to exclude.
excluded_email = 'excluded@example.com'
emails = extract_emails_from_folder(folder_path, excluded_email)
print(emails)