我有一个 python 脚本,它使用 selenium 在 ubuntu 服务器上进行网页抓取。我在每次迭代的 while 循环中创建新的驱动程序对象,并在使用后退出它。 该脚本在一段时间内运行良好。然后它停止创建新驱动程序,但出现异常“打开文件太多”。
这是脚本:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import requests
import os
import logging
from PyPDF2 import PdfReader
from webdriver_manager.chrome import ChromeDriverManager
from openai import OpenAI
import undetected_chromedriver as uc
import time
import shutil
import glob
import logging
import csv
import traceback
logging.basicConfig(format='%(asctime)s - %(name)s - %(process)d - %(threadName)s - %(levelname)s - %(message)s',
level=logging.WARNING,
handlers=[logging.FileHandler("/root/nse-announcements/logs-warnings.log"),
logging.StreamHandler()])
url = 'https://www.nseindia.com/companies-listing/corporate-filings-announcements'
bot_token = '7069953058:AAGsJ-hihPjME'
bot_chatID = os.getenv('TELEGRAM_BOT_CHAT_ID')
openai_api_key = os.getenv('OPENAI_API_KEY')
openAI_view = ''
first_page_text = ''
path_to_pdf = ''
pdf_link = ''
subject = ''
pdf_link_file_path = '/root/nse-announcements/announcements_pdf_links.txt'
temp_pdf = '/root/nse-announcements/temp.pdf'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
download_directory = "/root/nse-announcements/pdfs-of-announcements"
csv_file_path = '/root/nse-announcements/nse_announcements.csv'
csv_headers = ['Date', 'Symbol', 'Company Name', 'Subject', 'Details', 'OpenAI View', 'Broadcast Date/Time', 'PDF Link']
with requests.Session() as session:
response = session.get(f'https://api.telegram.org/bot{bot_token}/sendMessage',
params={'chat_id': bot_chatID, 'text': "Hello Unix User! I'm now active and ready to fetch NSE announcements for you."})
def clear_tmp_directory():
tmp_path = '/tmp'
tmp_dirs = glob.glob(f"{tmp_path}/tmp*" )
for directory in tmp_dirs:
try:
shutil.rmtree(directory)
except Exception as e:
pass
def clean_scoped_directories():
tmp_path = '/tmp'
scoped_dirs = glob.glob(f"{tmp_path}/scoped_dir*")
for directory in scoped_dirs:
try:
shutil.rmtree(directory)
except Exception as e:
pass
# logging.debug(f"Failed to remove {directory}: {str(e)}")
chrome_items = glob.glob(f"{tmp_path}/.com.google.Chrome.*")
for item in chrome_items:
try:
if os.path.isdir(item):
shutil.rmtree(item)
else:
os.remove(item)
except Exception as e:
# Check if the error is because the item is not a directory
if "Errno 20" in str(e):
try:
os.remove(item) # Attempt to delete it as a file
except Exception as e:
pass
# logging.debug(f"Failed to remove {item} as a file: {e}")
# else:
# logging.debug(f"Failed to remove {item}: {e}")
def clean_text(text):
return text.replace('&', '').replace('<', '').replace('>', '').replace('"', '').replace("'", "")
def send_telegram_message(data):
message = f"<b>Symbol:</b> {data['Symbol']}\n" \
f"<b>{data['pref_alert']}</b>\n" \
f"<b>Company Name:</b> {data['Company Name']}\n" \
f"<b>Broadcast Date/Time:</b> {data['Broadcast Date/Time']}\n" \
f"<b>Subject:</b> {data['Subject']}\n" \
f"<b>Details:</b> {data['Details']}\n" \
f"<b>OpenAI:</b> {data['OpenAI']}\n" \
f"<b>PDF Link:</b> {data['PDF Link']}"
url = f'https://api.telegram.org/bot{bot_token}/sendMessage'
params = {
'chat_id': bot_chatID,
'parse_mode': 'HTML',
'text': message
}
max_attempts = 30 # Maximum number of attempts to send the message
attempt = 0
connect_timeout, read_timeout = 10, 30 # Timeouts
delay_seconds = 1 # Initial delay for retries
while attempt < max_attempts:
try:
with requests.Session() as session:
response = session.get(url, params=params, timeout=(connect_timeout, read_timeout))
if response.status_code == 200:
return True
elif response.status_code in {301, 302, 307, 308}:
# Handle redirection, if needed
url = response.headers['Location']
elif 400 <= response.status_code < 500:
logging.warning(f'Client error: {response.status_code} - {response.text}')
break # Stop retrying for client errors
elif 500 <= response.status_code < 600:
logging.warning(f'Server error: {response.status_code} - {response.text}')
# Consider retrying or handling server-side errors
else:
logging.warning(f'Unhandled status code: {response.status_code}')
except Exception as e:
logging.warning(f'Exception in send_telegram_message: {str(e)}')
time.sleep(delay_seconds)
delay_seconds *= 2 # Exponential backoff
finally:
attempt += 1 # Ensure attempt is incremented
return False
def wait_for_download(path_to_pdf):
timeout = 5 # Maximum time to wait for the download to appear (in seconds)
start_time = time.time()
while True:
elapsed_time = time.time() - start_time
if os.path.exists(path_to_pdf) and os.path.getsize(path_to_pdf) > 0:
return True
if elapsed_time > timeout:
return False
time.sleep(1)
def process_pdf(pdf_link):
global path_to_pdf, openAI_view
driver = None
try:
chrome_options = uc.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument('log-level=3')
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-webgl")
chrome_options.add_experimental_option('prefs', {
"download.default_directory": download_directory,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True
})
driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
time.sleep(2)
driver.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': user_agent})
pdf_file_name = pdf_link.split('/')[-1]
path_to_pdf = os.path.join(download_directory, pdf_file_name)
for attempt in range(10): # Retry attempts
driver.get(pdf_link)
if wait_for_download(path_to_pdf):
with open(path_to_pdf, 'rb') as file:
reader = PdfReader(file)
first_page = reader.pages[0]
first_page_text = first_page.extract_text()
if first_page_text:
openAI_view = 'first_page_text'
return first_page_text
else:
logging.warning(f'PDF {pdf_link} first page contains image only content or no text.')
openAI_view = 'image PDF'
os.remove(path_to_pdf) # Move deletion outside of the try-except
return False
time.sleep(5) # Wait 5 seconds before the next retry
logging.warning(f"Failed to fetch PDF {pdf_link} after multiple attempts.")
openAI_view = 'failed to fetch PDF'
return False
except Exception as e:
logging.warning(f'Exception in process_pdf: {str(e)}')
openAI_view = 'error in processing PDF'
return False
finally:
if driver:
driver.quit()
def summarize_text(text):
client = OpenAI(api_key=openai_api_key)
if client.api_key is None:
return "API key not found in environment variables"
system_msg = 'You know what is most relevant and important for the investors of a company.'
user_msg = f'Summarize important information (ignore addresses) in company announcement text: {text}'
# system_msg = 'Identify and summarize key details from company announcements related specifically to fund raising through equity shares, warrants, or other securities. Focus on details such as the type of offering, amount being raised, type of investors, and any relevant approvals or voting outcomes. Ignore routine operational details and addresses.'
# user_msg = f'Analyze this announcement text and summarize any sections related to fund raising through equity shares or warrants: {text}'
attempts = 0
max_attempts = 3 # Set the number of attempts to 3
while attempts < max_attempts:
attempts += 1
# Create a chat completion using the client
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": system_msg},
{"role": "user", "content": user_msg}
],
model="gpt-3.5-turbo",
temperature=0.3,
max_tokens=150,
)
reason = chat_completion.choices[0].finish_reason
# Handling for the 'stop' finish reason
if reason == 'stop':
return chat_completion.choices[0].message.content
# Handling for the 'length' finish reason (output truncated due to max tokens)
elif reason == 'length':
return f"Output truncated: {chat_completion.choices[0].message.content}"
# Handling for the 'function_call' finish reason (model initiated a function call)
elif reason == 'function_call':
return "Function call initiated by the model."
# Handling for the 'content_filter' finish reason (content omitted by filters)
elif reason == 'content_filter':
return "Content filtered due to safety protocols."
# Handling for the 'null' finish reason (response still in progress or incomplete)
elif reason == 'null':
return "Response still in progress or incomplete."
time.sleep(2)
return "Failed to summarize text after maximum attempts."
def write_to_csv(data):
with open(csv_file_path, 'a', newline='') as csvfile: # 'a' mode to append to the file
writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
writer.writerow(data)
existing_links = set()
# Load existing links from file if it exists
try:
with open(pdf_link_file_path, 'r') as file:
existing_links = set(line.strip() for line in file)
except FileNotFoundError:
pass
try:
with open(csv_file_path, 'x', newline='') as csvfile: # 'x' mode to create and fail if exists
writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
writer.writeheader()
except FileExistsError:
pass
avoidable_keywords = ['74(5)', 'GST authority', 'delisting']
avoidable_subjects_details = ['Analysts/Institutional', 'FDA Inspection']
while True:
driver = None
try:
clean_scoped_directories()
clear_tmp_directory()
chrome_options = uc.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument('log-level=3')
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-webgl")
chrome_options.add_experimental_option('prefs', {
"download.default_directory": download_directory,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"plugins.always_open_pdf_externally": True
})
driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
time.sleep(2)
driver.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': user_agent})
wait = WebDriverWait(driver, 20)
driver.get(url)
time.sleep(5)
wait.until(EC.presence_of_element_located(
(By.XPATH, '//table[@id="CFanncEquityTable"]/tbody/tr[1]/td[6]')))
html_source = driver.page_source
soup = BeautifulSoup(html_source, 'html.parser')
table = soup.find('table', {'id': 'CFanncEquityTable'})
driver.quit()
if not table or table.find_all('a', href=True) == []:
continue
for row in table.find_all('tr'):
openAI_view = ''
first_page_text = ''
path_to_pdf = ''
pdf_link = ''
subject = ''
pref_alert = ''
cells = row.find_all('td')
if len(cells) > 4:
pdf_link = cells[4].find('a', href=True)['href'] if cells[4].find('a', href=True) and cells[4].find('a', href=True)['href'].endswith('.pdf') else None
subject = clean_text(cells[2].text.strip())
details = clean_text(cells[3].text.strip())
company_name = clean_text(cells[1].text.strip())
broadcast_date_time = cells[5].text.strip().split("Exchange")[0].strip()
symbol = cells[0].text.strip()
original_symbol = symbol
symbol = clean_text(symbol)
if symbol != original_symbol:
symbol = symbol + " (Edited)"
if pdf_link and pdf_link not in existing_links:
found_subject = None
for keyword in avoidable_subjects_details:
if keyword.lower() in subject.lower() or keyword.lower() in details.lower():
found_subject = keyword
break
if found_subject is not None:
logging.warning(f'************** SKIPPING PDF SINCE IT CONTAINS THE AVOIDABLE SUBJECT/DETAIL: {found_subject} ***************')
existing_links.add(pdf_link)
with open(pdf_link_file_path, 'a') as file:
file.write(pdf_link + '\n')
data = {
'Date': time.strftime("%Y-%m-%d"),
'Symbol': symbol,
'Company Name': company_name,
'Subject': subject,
'Details': details,
'OpenAI View': 'SKIP',
'Broadcast Date/Time': broadcast_date_time,
'PDF Link': pdf_link
}
write_to_csv(data)
continue
first_page_text = process_pdf(pdf_link)
if first_page_text:
os.remove(path_to_pdf)
found_keyword = None
preferential_issue_keywords = ['preferential issue', 'equity shares', 'convertible warrants', 'raising funds']
fund_raising_found = 'preferential' in first_page_text.lower() and any(keyword in first_page_text.lower() for keyword in preferential_issue_keywords)
if fund_raising_found:
pref_alert = '*** PREFERENTIAL ISSUE ALERT ***'
else:
for keyword in avoidable_keywords:
if keyword.lower() in first_page_text.lower():
found_keyword = keyword
break
if found_keyword is not None:
logging.warning(f'************** SKIPPING PDF SINCE IT CONTAINS THE AVOIDABLE KEYWORD: {found_keyword} ***************')
existing_links.add(pdf_link)
with open(pdf_link_file_path, 'a') as file:
file.write(pdf_link + '\n')
data = {
'Date': time.strftime("%Y-%m-%d"),
'Symbol': symbol,
'Company Name': company_name,
'Subject': subject,
'Details': details,
'OpenAI View': 'SKIP',
'Broadcast Date/Time': broadcast_date_time,
'PDF Link': pdf_link
}
write_to_csv(data)
continue
openAI_view = summarize_text(first_page_text)
data = {
'Symbol': symbol,
'Company Name': company_name,
'Subject': subject,
'Details': details,
'OpenAI': openAI_view,
'Broadcast Date/Time': broadcast_date_time,
'PDF Link': pdf_link
}
should_link_be_saved = send_telegram_message(data)
if should_link_be_saved:
existing_links.add(pdf_link)
with open(pdf_link_file_path, 'a') as file:
file.write(pdf_link + '\n')
data = {
'Date': time.strftime("%Y-%m-%d"),
'Symbol': symbol,
'Company Name': company_name,
'Subject': subject,
'Details': details,
'OpenAI View': openAI_view,
'Broadcast Date/Time': broadcast_date_time,
'PDF Link': pdf_link
}
write_to_csv(data)
except Exception as e:
tb = traceback.format_exc()
logging.warning(f'Exception in TRY BLOCK {str(e)}\nTraceback: {tb}')
if 'too many open files' in str(e).lower():
break
finally:
if driver:
driver.quit()
我运行了 lsof,这是输出:
root@nse-announcements:~# lsof -c python
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
python 120968 root cwd DIR 253,1 4096 787286 /root/nse-announcements
python 120968 root rtd DIR 253,1 4096 2 /
python 120968 root txt REG 253,1 6752256 1967 /usr/bin/python3.11
python 120968 root mem REG 253,1 5026584 790814 /root/nse-announcements/venv/lib/python3.11/site-packages/pydantic_core/_pydantic_core.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 141872 2769 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1
python 120968 root mem REG 253,1 311112 6376 /usr/lib/python3.11/lib-dynload/_decimal.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 264392 790605 /root/nse-announcements/venv/lib/python3.11/site-packages/charset_normalizer/md__mypyc.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 74496 6361 /usr/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 143992 6371 /usr/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 5235544 3132 /usr/lib/x86_64-linux-gnu/libcrypto.so.3
python 120968 root mem REG 253,1 43560 2790 /usr/lib/x86_64-linux-gnu/libffi.so.8.1.2
python 120968 root mem REG 253,1 24336 6381 /usr/lib/python3.11/lib-dynload/_multiprocessing.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 54488 6380 /usr/lib/python3.11/lib-dynload/_multibytecodec.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 202904 3071 /usr/lib/x86_64-linux-gnu/liblzma.so.5.4.1
python 120968 root mem REG 253,1 14712 74280 /usr/lib/x86_64-linux-gnu/librt.so.1
python 120968 root mem REG 253,1 671960 3133 /usr/lib/x86_64-linux-gnu/libssl.so.3
python 120968 root mem REG 253,1 3052896 4166 /usr/lib/locale/locale-archive
python 120968 root mem REG 253,1 74848 3879 /usr/lib/x86_64-linux-gnu/libbz2.so.1.0.4
python 120968 root mem REG 253,1 2105184 71658 /usr/lib/x86_64-linux-gnu/libc.so.6
python 120968 root mem REG 253,1 14480 73974 /usr/lib/x86_64-linux-gnu/libpthread.so.0
python 120968 root mem REG 253,1 45080 6379 /usr/lib/python3.11/lib-dynload/_lzma.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 32088 6362 /usr/lib/python3.11/lib-dynload/_bz2.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 216944 6360 /usr/lib/python3.11/lib-dynload/_ssl.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 63696 6359 /usr/lib/python3.11/lib-dynload/_hashlib.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 14504 6369 /usr/lib/python3.11/lib-dynload/_contextvars.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 35032 1885 /usr/lib/x86_64-linux-gnu/libuuid.so.1.3.0
python 120968 root DEL REG 0,26 1032 /dev/shm/sem.LNVy9P
python 120968 root mem REG 253,1 16064 790602 /root/nse-announcements/venv/lib/python3.11/site-packages/charset_normalizer/md.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 14688 6392 /usr/lib/python3.11/lib-dynload/_uuid.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 49128 6377 /usr/lib/python3.11/lib-dynload/_json.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 23664 6383 /usr/lib/python3.11/lib-dynload/_queue.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 14472 6391 /usr/lib/python3.11/lib-dynload/_typing.cpython-311-x86_64-linux-gnu.so
python 120968 root mem REG 253,1 357584 1908 /usr/lib/locale/C.utf8/LC_CTYPE
python 120968 root mem REG 253,1 174336 5421 /usr/lib/x86_64-linux-gnu/libexpat.so.1.8.10
python 120968 root mem REG 253,1 121200 3887 /usr/lib/x86_64-linux-gnu/libz.so.1.2.13
python 120968 root mem REG 253,1 957008 71661 /usr/lib/x86_64-linux-gnu/libm.so.6
python 120968 root mem REG 253,1 27028 75433 /usr/lib/x86_64-linux-gnu/gconv/gconv-modules.cache
python 120968 root mem REG 253,1 232568 71655 /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
python 120968 root 0r CHR 1,3 0t0 5 /dev/null
python 120968 root 1u unix 0xffff9ab807f43b80 0t0 724111 type=STREAM (CONNECTED)
python 120968 root 2u unix 0xffff9ab807f43b80 0t0 724111 type=STREAM (CONNECTED)
python 120968 root 3w REG 253,1 882 793138 /root/nse-announcements/logs-warnings.log
python 120968 root 4u IPv4 724137 0t0 TCP nse-announcements:35066->149.154.167.220:https (ESTABLISHED)
python 120968 root 5u IPv6 724255 0t0 TCP localhost:46688->localhost:60247 (CLOSE_WAIT)
python 120968 root 6u IPv6 725494 0t0 TCP localhost:52052->localhost:38957 (CLOSE_WAIT)
python 120968 root 13u IPv6 730702 0t0 TCP localhost:33460->localhost:60785 (ESTABLISHED)
python 120968 root 14w FIFO 0,14 0t0 728361 pipe
python 120968 root 19r FIFO 0,14 0t0 730626 pipe
root@nse-announcements:~#
python 将 TCP 连接保持在 CLOSE_WAIT 状态。我不知道这些连接是在脚本中的何处创建的。完全神秘。 这里可以做什么,以便脚本无限期地运行,正如它应该的那样。
我在不定 while 循环的每次迭代中创建一个新的驱动程序实例。这就是罪魁祸首。当我只为脚本创建一个驱动程序实例时,问题就停止了。