使用Python从msg文件中提取电子邮件地址

问题描述 投票:0回答:1

我是 Python 新手,正在尝试编写一个脚本,从 Outlook 的 .msg 文件中提取电子邮件地址。但是,有些 .msg 文件还带有另一个 .msg 文件作为附件。

def extract_emails_from_folder(folder_path, excluded_email):
    """Collect e-mail addresses from every .msg file in a folder and print them.

    Args:
        folder_path: Directory to scan (top level only, not recursive).
        excluded_email: Passed unchanged to ``extract_emails_from_msg`` for
            each file.

    Returns:
        The ``pandas.DataFrame`` that was printed; it has a single column,
        ``'Email Addresses'``, holding the de-duplicated, sorted addresses.
    """
    # Create an empty set to store extracted email addresses
    email_addresses = set()

    # Iterate through each entry in the folder; sorting makes the processing
    # order independent of the platform's directory listing order.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Only regular files count, and the extension may be upper-case
        # (e.g. "MAIL.MSG"), so compare case-insensitively.
        if not filename.lower().endswith('.msg') or not os.path.isfile(file_path):
            continue
        # Extract email addresses from the MSG file
        email_addresses.update(extract_emails_from_msg(file_path, excluded_email))

    # Create a DataFrame to store the email addresses (sorted so the output
    # is stable from run to run, unlike raw set ordering).
    df = pd.DataFrame({'Email Addresses': sorted(email_addresses)})
    print(df)
    return df

有些电子邮件地址位于作为附件嵌入的另一个 .msg 文件中。请问该如何把这些嵌套 .msg 文件里的电子邮件地址也一并提取出来?

python email-attachments
1个回答
0
投票

您可以使用 `extract-msg` 库:

import os
import re
import tempfile

import extract_msg
# Same pattern the original applied to the body; compiled once for reuse.
_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')


def _find_addresses(field):
    """Return the set of e-mail-looking tokens found in *field*.

    *field* may be ``None``/empty, a ``str``, ``bytes``, or a list/tuple/set
    of such values; anything else is converted with ``str()`` first.
    """
    if not field:
        return set()
    if isinstance(field, bytes):
        field = field.decode('utf-8', errors='replace')
    if isinstance(field, str):
        return set(_EMAIL_RE.findall(field))
    if isinstance(field, (list, tuple, set, frozenset)):
        found = set()
        for item in field:
            found.update(_find_addresses(item))
        return found
    return set(_EMAIL_RE.findall(str(field)))


def extract_emails_from_msg(file_path, excluded_email):
    """Extract e-mail addresses from an Outlook .msg file and nested .msg attachments.

    Addresses are taken from the sender, to, cc and bcc fields and from the
    message body. Attachments whose file name ends in ``.msg`` are written to
    a temporary directory and processed recursively.

    Args:
        file_path: Path of the .msg file to read.
        excluded_email: Address removed from the result if present.

    Returns:
        A set of address strings.
    """
    email_addresses = set()
    with extract_msg.Message(file_path) as msg:
        # Every field goes through the regex: calling set.update() directly
        # on a header string would add it one character at a time, and a
        # missing (None) field would raise TypeError.
        for field in (msg.sender, msg.to, msg.cc, msg.bcc, msg.body):
            email_addresses.update(_find_addresses(field))

        for att in msg.attachments:
            att_name = att.longFilename
            # Skip attachments without a name; compare the extension
            # case-insensitively so "NESTED.MSG" is handled too.
            if not att_name or not att_name.lower().endswith('.msg'):
                continue
            # The attachment name comes from the message itself, so keep only
            # its last path component before building a filesystem path.
            safe_name = os.path.basename(att_name.replace('\\', '/'))
            # A private temporary directory avoids the hard-coded '/tmp',
            # avoids name collisions, and is removed even if parsing fails.
            with tempfile.TemporaryDirectory() as tmp_dir:
                att_filename = os.path.join(tmp_dir, safe_name)
                # NOTE(review): saveCustom is kept from the original snippet;
                # confirm it exists in the installed extract-msg version and
                # that it writes to exactly this path.
                att.saveCustom(att_filename)
                email_addresses.update(extract_emails_from_msg(att_filename, excluded_email))
    email_addresses.discard(excluded_email)
    return email_addresses
def extract_emails_from_folder(folder_path, excluded_email):
    """Gather e-mail addresses from all .msg files directly inside a folder.

    Args:
        folder_path: Directory to scan (top level only, not recursive).
        excluded_email: Passed unchanged to ``extract_emails_from_msg`` for
            each file.

    Returns:
        A set containing the union of the addresses returned for each file;
        empty when the folder holds no .msg files.
    """
    email_addresses = set()
    for filename in os.listdir(folder_path):
        # Compare the extension case-insensitively so "MAIL.MSG" is included.
        if not filename.lower().endswith('.msg'):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory that happens to be named "*.msg" is not a message file.
        if not os.path.isfile(file_path):
            continue
        email_addresses.update(extract_emails_from_msg(file_path, excluded_email))
    return email_addresses
# Example usage. Guarded so that importing this module does not scan a folder
# as a side effect; running the file as a script behaves as before.
if __name__ == "__main__":
    folder_path = '/path/to/your/folder'
    # Placeholder address to leave out of the results; replace with your own.
    # (The previous literal was e-mail-obfuscation residue, not an address.)
    excluded_email = 'excluded@example.com'
    emails = extract_emails_from_folder(folder_path, excluded_email)
    print(emails)
© www.soinside.com 2019 - 2024. All rights reserved.