我正在尝试编写一个脚本,从 Outlook .msg 文件中提取电子邮件地址。但是,某些 .msg 文件可能附加有另一个 .msg 文件。
def extract_emails_from_folder(folder_path, excluded_email):
    """Print a table of the e-mail addresses found in a folder's ``.msg`` files.

    Args:
        folder_path: Directory to scan. Only its direct entries are examined;
            sub-directories are not searched.
        excluded_email: Passed through unchanged to ``extract_emails_from_msg``.

    Returns:
        None. The result is printed as a one-column pandas DataFrame.
    """
    email_addresses = set()
    for filename in os.listdir(folder_path):
        # Compare in lower case so that files saved as ``.MSG`` are not skipped.
        if not filename.lower().endswith('.msg'):
            continue
        file_path = os.path.join(folder_path, filename)
        email_addresses.update(extract_emails_from_msg(file_path, excluded_email))

    # A set removes duplicates; the DataFrame is only used for display.
    df = pd.DataFrame({'Email Addresses': list(email_addresses)})
    print(df)
有些电子邮件地址并不在 .msg 文件本身，而是在作为附件嵌入其中的另一个 .msg 文件里。请问怎样才能把这些地址也一并提取出来？
您可以使用 extract-msg 库：
import os
import re
import tempfile

import extract_msg
def extract_emails_from_msg(file_path, excluded_email):
    """Collect the e-mail addresses found in an Outlook ``.msg`` file.

    The sender, To, Cc and Bcc fields and the message body are scanned with a
    regular expression. Attachments whose file name ends in ``.msg`` are
    written to a temporary directory and scanned recursively, so addresses
    inside nested messages are included as well.

    Args:
        file_path: Path of the ``.msg`` file to read.
        excluded_email: A single address to drop from the result.

    Returns:
        A set of address strings.
    """
    pattern = r'[\w\.-]+@[\w\.-]+'
    email_addresses = set()
    with extract_msg.Message(file_path) as msg:
        # extract-msg exposes these fields as strings (or None when absent),
        # not as lists. Passing a string to set.update() would add its single
        # characters, so every field goes through the regex instead.
        for text in (msg.sender, msg.to, msg.cc, msg.bcc, msg.body):
            if text:
                email_addresses.update(re.findall(pattern, text))

        for att in msg.attachments:
            att_name = att.longFilename or ''
            if not att_name.lower().endswith('.msg'):
                continue
            # A private temporary directory works on every platform and is
            # removed even when the recursive call raises.
            with tempfile.TemporaryDirectory() as tmp_dir:
                # basename() keeps a crafted attachment name from escaping tmp_dir.
                att_filename = os.path.join(tmp_dir, os.path.basename(att_name))
                # NOTE(review): saveCustom() is kept from the original snippet;
                # confirm it exists in the installed extract-msg version.
                att.saveCustom(att_filename)
                email_addresses.update(
                    extract_emails_from_msg(att_filename, excluded_email)
                )

    email_addresses.discard(excluded_email)
    return email_addresses
def extract_emails_from_folder(folder_path, excluded_email):
    """Gather the e-mail addresses from every ``.msg`` file in a folder.

    Args:
        folder_path: Directory to scan. Only its direct entries are examined;
            sub-directories are not searched.
        excluded_email: Passed through unchanged to ``extract_emails_from_msg``.

    Returns:
        A set holding the union of the addresses returned for each file; empty
        when the folder contains no ``.msg`` file.
    """
    email_addresses = set()
    for filename in os.listdir(folder_path):
        # Compare in lower case so that files saved as ``.MSG`` are not skipped.
        if not filename.lower().endswith('.msg'):
            continue
        file_path = os.path.join(folder_path, filename)
        email_addresses |= set(extract_emails_from_msg(file_path, excluded_email))
    return email_addresses
# Example usage: scan every .msg file in one folder and print the addresses found.
folder_path = '/path/to/your/folder'
# Placeholder for the address to leave out of the result; replace it with a real one.
excluded_email = 'excluded@example.com'
emails = extract_emails_from_folder(folder_path, excluded_email)
print(emails)
不要忘记安装库🤠:
pip install extract-msg
您可以使用“extract_msg”库
pip install extract-msg
from extract_msg import Message
def extract_emails_from_folder(folder_path, excluded_email):
    """Return the sender, To and Cc addresses of one ``.msg`` file.

    Despite its name, this function opens ``folder_path`` as a single message
    file, not as a directory.

    Args:
        folder_path: Path of the ``.msg`` file to read.
        excluded_email: One address, or a collection of addresses, to leave
            out of the result. ``None`` excludes nothing.

    Returns:
        A list of the addresses found, in order of first appearance, without
        duplicates and without the excluded ones. An empty list is returned
        when the file cannot be read; the error is printed, not raised.
    """
    if excluded_email is None:
        excluded = set()
    elif isinstance(excluded_email, str):
        # A bare string would otherwise be matched by substring.
        excluded = {excluded_email}
    else:
        excluded = set(excluded_email)

    try:
        found = []
        # The context manager closes the file even if reading a field fails.
        with Message(folder_path) as msg:
            for field in (msg.sender, msg.to, msg.cc):
                if not field:
                    continue
                # Fields may hold display names and several recipients, so
                # pull out the bare addresses before filtering.
                for address in re.findall(r'[\w\.-]+@[\w\.-]+', field):
                    if address not in excluded and address not in found:
                        found.append(address)
        return found
    except Exception as e:
        # Best effort: one unreadable file must not abort a whole batch.
        print(f"Error extracting email addresses from {folder_path}: {e}")
        return []
# Addresses that must not end up in the final result.
excluded_email = ["Your Excluded Mails here"]
email_addresses = set()

for filename in os.listdir(folder_path):
    # Anything that is not an Outlook message file is ignored.
    if not filename.endswith('.msg'):
        continue
    file_path = os.path.join(folder_path, filename)
    extracted_mail = extract_emails_from_msg(file_path, excluded_email)
    # An empty result adds nothing to the set, so it needs no special case.
    email_addresses.update(extracted_mail)

# Show the collected addresses as a one-column table.
df = pd.DataFrame({'Email Addresses': list(email_addresses)})
print(df)