"TCP localhost:57318->localhost:34033 (CLOSE_WAIT)" and "Too many open files"

Problem description

I have a Python script that scrapes web pages with Selenium on an Ubuntu server. In each iteration of a while loop I create a new driver object and quit it after use. The script runs fine for a while, then it stops being able to create new drivers and raises the exception "Too many open files".

Here is the script:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import requests
import os
import logging
from PyPDF2 import PdfReader
from webdriver_manager.chrome import ChromeDriverManager
from openai import OpenAI
import undetected_chromedriver as uc
import time
import shutil
import glob
import csv
import traceback



logging.basicConfig(format='%(asctime)s - %(name)s - %(process)d - %(threadName)s - %(levelname)s - %(message)s',
                        level=logging.WARNING,
                        handlers=[logging.FileHandler("/root/nse-announcements/logs-warnings.log"),
                                  logging.StreamHandler()])

url = 'https://www.nseindia.com/companies-listing/corporate-filings-announcements'
bot_token = '7069953058:AAGsJ-hihPjME'
bot_chatID = os.getenv('TELEGRAM_BOT_CHAT_ID')
openai_api_key = os.getenv('OPENAI_API_KEY')
openAI_view = ''
first_page_text = ''
path_to_pdf = ''
pdf_link = ''
subject = ''

pdf_link_file_path = '/root/nse-announcements/announcements_pdf_links.txt'
temp_pdf = '/root/nse-announcements/temp.pdf'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
download_directory = "/root/nse-announcements/pdfs-of-announcements"
csv_file_path = '/root/nse-announcements/nse_announcements.csv'
csv_headers = ['Date', 'Symbol', 'Company Name', 'Subject', 'Details', 'OpenAI View', 'Broadcast Date/Time', 'PDF Link']

with requests.Session() as session:
    response = session.get(f'https://api.telegram.org/bot{bot_token}/sendMessage', 
                params={'chat_id': bot_chatID, 'text': "Hello Unix User! I'm now active and ready to fetch NSE announcements for you."})
    


def clear_tmp_directory():
    tmp_path = '/tmp'

    tmp_dirs = glob.glob(f"{tmp_path}/tmp*" )
    for directory in tmp_dirs:
        try:
            shutil.rmtree(directory)
        except Exception as e:
            pass

def clean_scoped_directories():
    tmp_path = '/tmp'

    scoped_dirs = glob.glob(f"{tmp_path}/scoped_dir*")
    for directory in scoped_dirs:
        try:
            shutil.rmtree(directory)
        except Exception as e:
            pass
            # logging.debug(f"Failed to remove {directory}: {str(e)}")


    chrome_items = glob.glob(f"{tmp_path}/.com.google.Chrome.*")
    for item in chrome_items:
        try:
            if os.path.isdir(item):
                shutil.rmtree(item)
            else:
                os.remove(item)
        except Exception as e:
            # Check if the error is because the item is not a directory
            if "Errno 20" in str(e):
                try:
                    os.remove(item)  # Attempt to delete it as a file
                except Exception as e:
                    pass
                    # logging.debug(f"Failed to remove {item} as a file: {e}")
            # else:
                # logging.debug(f"Failed to remove {item}: {e}")

def clean_text(text):
    return text.replace('&', '').replace('<', '').replace('>', '').replace('"', '').replace("'", "")

def send_telegram_message(data):
    message = f"<b>Symbol:</b> {data['Symbol']}\n" \
              f"<b>{data['pref_alert']}</b>\n" \
              f"<b>Company Name:</b> {data['Company Name']}\n" \
              f"<b>Broadcast Date/Time:</b> {data['Broadcast Date/Time']}\n" \
              f"<b>Subject:</b> {data['Subject']}\n" \
              f"<b>Details:</b> {data['Details']}\n" \
              f"<b>OpenAI:</b> {data['OpenAI']}\n" \
              f"<b>PDF Link:</b> {data['PDF Link']}"

    url = f'https://api.telegram.org/bot{bot_token}/sendMessage'
    params = {
        'chat_id': bot_chatID,
        'parse_mode': 'HTML',
        'text': message
    }
    max_attempts = 30  # Maximum number of attempts to send the message
    attempt = 0
    connect_timeout, read_timeout = 10, 30  # Timeouts
    delay_seconds = 1  # Initial delay for retries

    while attempt < max_attempts:
        try:
            with requests.Session() as session:
                response = session.get(url, params=params, timeout=(connect_timeout, read_timeout))
                if response.status_code == 200:
                    return True
                elif response.status_code in {301, 302, 307, 308}:
                    # Handle redirection, if needed
                    url = response.headers['Location']
                elif 400 <= response.status_code < 500:
                    logging.warning(f'Client error: {response.status_code} - {response.text}')
                    break  # Stop retrying for client errors
                elif 500 <= response.status_code < 600:
                    logging.warning(f'Server error: {response.status_code} - {response.text}')
                    # Consider retrying or handling server-side errors
                else:
                    logging.warning(f'Unhandled status code: {response.status_code}')
        except Exception as e:
            logging.warning(f'Exception in send_telegram_message: {str(e)}')
            time.sleep(delay_seconds)
            delay_seconds *= 2  # Exponential backoff
        finally:
            attempt += 1  # Ensure attempt is incremented

    return False

def wait_for_download(path_to_pdf):
    timeout = 5  # Maximum time to wait for the download to appear (in seconds)
    start_time = time.time()
    while True:
        elapsed_time = time.time() - start_time
        if os.path.exists(path_to_pdf) and os.path.getsize(path_to_pdf) > 0:
            return True
        if elapsed_time > timeout:
            return False
        time.sleep(1)

def process_pdf(pdf_link):
    global path_to_pdf, openAI_view
    driver = None
    try:
        chrome_options = uc.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument('log-level=3')
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-webgl")
        chrome_options.add_experimental_option('prefs', {
            "download.default_directory": download_directory,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True
        })
        driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
        time.sleep(2)
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': user_agent})

        pdf_file_name = pdf_link.split('/')[-1]
        path_to_pdf = os.path.join(download_directory, pdf_file_name)

        for attempt in range(10):  # Retry attempts
            driver.get(pdf_link)
            if wait_for_download(path_to_pdf):
                with open(path_to_pdf, 'rb') as file:
                    reader = PdfReader(file)
                    first_page = reader.pages[0]
                    first_page_text = first_page.extract_text()
                    if first_page_text:
                        openAI_view = 'first_page_text'
                        return first_page_text
                    else:
                        logging.warning(f'PDF {pdf_link} first page contains image only content or no text.')
                        openAI_view = 'image PDF'
                os.remove(path_to_pdf)  # Move deletion outside of the try-except
                return False
            time.sleep(5)  # Wait 5 seconds before the next retry

        
        logging.warning(f"Failed to fetch PDF {pdf_link} after multiple attempts.")
        openAI_view = 'failed to fetch PDF'
        return False

    except Exception as e:
        logging.warning(f'Exception in process_pdf: {str(e)}')
        openAI_view = 'error in processing PDF'
        return False

    finally:
        if driver:
            driver.quit()


def summarize_text(text):
    client = OpenAI(api_key=openai_api_key)
    
    if client.api_key is None:
        return "API key not found in environment variables"
    
    system_msg = 'You know what is most relevant and important for the investors of a company.'
    user_msg = f'Summarize important information (ignore addresses) in company announcement text: {text}'
    # system_msg = 'Identify and summarize key details from company announcements related specifically to fund raising through equity shares, warrants, or other securities. Focus on details such as the type of offering, amount being raised, type of investors, and any relevant approvals or voting outcomes. Ignore routine operational details and addresses.'
    # user_msg = f'Analyze this announcement text and summarize any sections related to fund raising through equity shares or warrants: {text}'


    attempts = 0
    max_attempts = 3  # Set the number of attempts to 3

    while attempts < max_attempts:
        attempts += 1
        # Create a chat completion using the client
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg}
            ],
            model="gpt-3.5-turbo",
            temperature=0.3,
            max_tokens=150,
        )

        reason = chat_completion.choices[0].finish_reason

        # Handling for the 'stop' finish reason
        if reason == 'stop':
            return chat_completion.choices[0].message.content

        # Handling for the 'length' finish reason (output truncated due to max tokens)
        elif reason == 'length':
            return f"Output truncated: {chat_completion.choices[0].message.content}"

        # Handling for the 'function_call' finish reason (model initiated a function call)
        elif reason == 'function_call':
            return "Function call initiated by the model."

        # Handling for the 'content_filter' finish reason (content omitted by filters)
        elif reason == 'content_filter':
            return "Content filtered due to safety protocols."

        # Handling for the 'null' finish reason (response still in progress or incomplete)
        elif reason == 'null':
            return "Response still in progress or incomplete."
    
        time.sleep(2)
    return "Failed to summarize text after maximum attempts."

def write_to_csv(data):
    with open(csv_file_path, 'a', newline='') as csvfile:  # 'a' mode to append to the file
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writerow(data)


existing_links = set()
# Load existing links from file if it exists
try:
    with open(pdf_link_file_path, 'r') as file:
        existing_links = set(line.strip() for line in file)
except FileNotFoundError:
    pass

try:
    with open(csv_file_path, 'x', newline='') as csvfile:  # 'x' mode to create and fail if exists
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
except FileExistsError:
    pass

avoidable_keywords = ['74(5)', 'GST authority', 'delisting']

avoidable_subjects_details = ['Analysts/Institutional', 'FDA Inspection']


while True:
    driver = None
    try:
        clean_scoped_directories()
        clear_tmp_directory()
        chrome_options = uc.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument('log-level=3')
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-webgl")
        chrome_options.add_experimental_option('prefs', {
            "download.default_directory": download_directory,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True
            })
        driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options) 
        time.sleep(2)
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': user_agent})
        wait = WebDriverWait(driver, 20)
        driver.get(url)
        time.sleep(5)
        wait.until(EC.presence_of_element_located(
                (By.XPATH, '//table[@id="CFanncEquityTable"]/tbody/tr[1]/td[6]')))
        html_source = driver.page_source
        soup = BeautifulSoup(html_source, 'html.parser')
        table = soup.find('table', {'id': 'CFanncEquityTable'})
        driver.quit()
        if not table or table.find_all('a', href=True) == []:
            continue

        for row in table.find_all('tr'): 
            openAI_view = ''
            first_page_text = ''
            path_to_pdf = ''
            pdf_link = ''
            subject = ''
            pref_alert = ''
            cells = row.find_all('td')
            if len(cells) > 4:
                pdf_link = cells[4].find('a', href=True)['href'] if cells[4].find('a', href=True) and cells[4].find('a', href=True)['href'].endswith('.pdf') else None
                subject = clean_text(cells[2].text.strip())
                details = clean_text(cells[3].text.strip())
                company_name = clean_text(cells[1].text.strip())
                broadcast_date_time = cells[5].text.strip().split("Exchange")[0].strip()
                symbol = cells[0].text.strip()
                original_symbol = symbol
                symbol = clean_text(symbol)
                if symbol != original_symbol:
                    symbol = symbol + " (Edited)"

                if pdf_link and pdf_link not in existing_links:
                    found_subject = None
                    for keyword in avoidable_subjects_details:
                        if keyword.lower() in subject.lower() or keyword.lower() in details.lower():
                            found_subject = keyword
                            break

                    if found_subject is not None:
                        logging.warning(f'************** SKIPPING PDF SINCE IT CONTAINS THE AVOIDABLE SUBJECT/DETAIL: {found_subject} ***************')
                        existing_links.add(pdf_link)
                        with open(pdf_link_file_path, 'a') as file:
                            file.write(pdf_link + '\n')
                        data = {
                            'Date': time.strftime("%Y-%m-%d"), 
                            'Symbol': symbol,
                            'Company Name': company_name,
                            'Subject': subject,
                            'Details': details,
                            'OpenAI View': 'SKIP',
                            'Broadcast Date/Time': broadcast_date_time,
                            'PDF Link': pdf_link
                        }
                        write_to_csv(data)
                        continue

                    first_page_text = process_pdf(pdf_link)
                    if first_page_text:
                        os.remove(path_to_pdf)
                        found_keyword = None
                        preferential_issue_keywords = ['preferential issue', 'equity shares', 'convertible warrants', 'raising funds']
                        fund_raising_found = 'preferential' in first_page_text.lower() and any(keyword in first_page_text.lower() for keyword in preferential_issue_keywords)
                        if fund_raising_found:
                            pref_alert = '*** PREFERENTIAL ISSUE ALERT ***'
                        else:
                            for keyword in avoidable_keywords:
                                if keyword.lower() in first_page_text.lower():
                                    found_keyword = keyword
                                    break

                            if found_keyword is not None:
                                logging.warning(f'************** SKIPPING PDF SINCE IT CONTAINS THE AVOIDABLE KEYWORD: {found_keyword} ***************')
                                existing_links.add(pdf_link)
                                with open(pdf_link_file_path, 'a') as file:
                                    file.write(pdf_link + '\n')
                                data = {
                                    'Date': time.strftime("%Y-%m-%d"), 
                                    'Symbol': symbol,
                                    'Company Name': company_name,
                                    'Subject': subject,
                                    'Details': details,
                                    'OpenAI View': 'SKIP',
                                    'Broadcast Date/Time': broadcast_date_time,
                                    'PDF Link': pdf_link
                                }
                                write_to_csv(data)
                                continue

                        openAI_view = summarize_text(first_page_text)

                    data = {
                        'Symbol': symbol,
                        'pref_alert': pref_alert,
                        'Company Name': company_name,
                        'Subject': subject,
                        'Details': details,
                        'OpenAI': openAI_view,
                        'Broadcast Date/Time': broadcast_date_time,
                        'PDF Link': pdf_link
                    }
                    should_link_be_saved = send_telegram_message(data)
                    if should_link_be_saved:
                        existing_links.add(pdf_link)
                        with open(pdf_link_file_path, 'a') as file:
                            file.write(pdf_link + '\n')

                data = {
                    'Date': time.strftime("%Y-%m-%d"),
                    'Symbol': symbol,
                    'Company Name': company_name,
                    'Subject': subject,
                    'Details': details,
                    'OpenAI View': openAI_view,
                    'Broadcast Date/Time': broadcast_date_time,
                    'PDF Link': pdf_link
                }
                write_to_csv(data)

    except Exception as e:
        tb = traceback.format_exc()
        logging.warning(f'Exception in TRY BLOCK {str(e)}\nTraceback: {tb}')
        if 'too many open files' in str(e).lower():
                break
    finally:
        if driver:
            driver.quit()

I ran lsof, and this is the output:

root@nse-announcements:~# lsof -c python
COMMAND    PID USER   FD   TYPE             DEVICE SIZE/OFF   NODE NAME
python  120968 root  cwd    DIR              253,1     4096 787286 /root/nse-announcements
python  120968 root  rtd    DIR              253,1     4096      2 /
python  120968 root  txt    REG              253,1  6752256   1967 /usr/bin/python3.11
python  120968 root  mem    REG              253,1  5026584 790814 /root/nse-announcements/venv/lib/python3.11/site-packages/pydantic_core/_pydantic_core.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1   141872   2769 /usr/lib/x86_64-linux-gnu/libgcc_s.so.1
python  120968 root  mem    REG              253,1   311112   6376 /usr/lib/python3.11/lib-dynload/_decimal.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1   264392 790605 /root/nse-announcements/venv/lib/python3.11/site-packages/charset_normalizer/md__mypyc.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    74496   6361 /usr/lib/python3.11/lib-dynload/_asyncio.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1   143992   6371 /usr/lib/python3.11/lib-dynload/_ctypes.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1  5235544   3132 /usr/lib/x86_64-linux-gnu/libcrypto.so.3
python  120968 root  mem    REG              253,1    43560   2790 /usr/lib/x86_64-linux-gnu/libffi.so.8.1.2
python  120968 root  mem    REG              253,1    24336   6381 /usr/lib/python3.11/lib-dynload/_multiprocessing.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    54488   6380 /usr/lib/python3.11/lib-dynload/_multibytecodec.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1   202904   3071 /usr/lib/x86_64-linux-gnu/liblzma.so.5.4.1
python  120968 root  mem    REG              253,1    14712  74280 /usr/lib/x86_64-linux-gnu/librt.so.1
python  120968 root  mem    REG              253,1   671960   3133 /usr/lib/x86_64-linux-gnu/libssl.so.3
python  120968 root  mem    REG              253,1  3052896   4166 /usr/lib/locale/locale-archive
python  120968 root  mem    REG              253,1    74848   3879 /usr/lib/x86_64-linux-gnu/libbz2.so.1.0.4
python  120968 root  mem    REG              253,1  2105184  71658 /usr/lib/x86_64-linux-gnu/libc.so.6
python  120968 root  mem    REG              253,1    14480  73974 /usr/lib/x86_64-linux-gnu/libpthread.so.0
python  120968 root  mem    REG              253,1    45080   6379 /usr/lib/python3.11/lib-dynload/_lzma.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    32088   6362 /usr/lib/python3.11/lib-dynload/_bz2.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1   216944   6360 /usr/lib/python3.11/lib-dynload/_ssl.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    63696   6359 /usr/lib/python3.11/lib-dynload/_hashlib.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    14504   6369 /usr/lib/python3.11/lib-dynload/_contextvars.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    35032   1885 /usr/lib/x86_64-linux-gnu/libuuid.so.1.3.0
python  120968 root  DEL    REG               0,26            1032 /dev/shm/sem.LNVy9P
python  120968 root  mem    REG              253,1    16064 790602 /root/nse-announcements/venv/lib/python3.11/site-packages/charset_normalizer/md.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    14688   6392 /usr/lib/python3.11/lib-dynload/_uuid.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    49128   6377 /usr/lib/python3.11/lib-dynload/_json.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    23664   6383 /usr/lib/python3.11/lib-dynload/_queue.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1    14472   6391 /usr/lib/python3.11/lib-dynload/_typing.cpython-311-x86_64-linux-gnu.so
python  120968 root  mem    REG              253,1   357584   1908 /usr/lib/locale/C.utf8/LC_CTYPE
python  120968 root  mem    REG              253,1   174336   5421 /usr/lib/x86_64-linux-gnu/libexpat.so.1.8.10
python  120968 root  mem    REG              253,1   121200   3887 /usr/lib/x86_64-linux-gnu/libz.so.1.2.13
python  120968 root  mem    REG              253,1   957008  71661 /usr/lib/x86_64-linux-gnu/libm.so.6
python  120968 root  mem    REG              253,1    27028  75433 /usr/lib/x86_64-linux-gnu/gconv/gconv-modules.cache
python  120968 root  mem    REG              253,1   232568  71655 /usr/lib/x86_64-linux-gnu/ld-linux-x86-64.so.2
python  120968 root    0r   CHR                1,3      0t0      5 /dev/null
python  120968 root    1u  unix 0xffff9ab807f43b80      0t0 724111 type=STREAM (CONNECTED)
python  120968 root    2u  unix 0xffff9ab807f43b80      0t0 724111 type=STREAM (CONNECTED)
python  120968 root    3w   REG              253,1      882 793138 /root/nse-announcements/logs-warnings.log
python  120968 root    4u  IPv4             724137      0t0    TCP nse-announcements:35066->149.154.167.220:https (ESTABLISHED)
python  120968 root    5u  IPv6             724255      0t0    TCP localhost:46688->localhost:60247 (CLOSE_WAIT)
python  120968 root    6u  IPv6             725494      0t0    TCP localhost:52052->localhost:38957 (CLOSE_WAIT)
python  120968 root   13u  IPv6             730702      0t0    TCP localhost:33460->localhost:60785 (ESTABLISHED)
python  120968 root   14w  FIFO               0,14      0t0 728361 pipe
python  120968 root   19r  FIFO               0,14      0t0 730626 pipe
root@nse-announcements:~#

Python is holding TCP connections in the CLOSE_WAIT state. I do not know where in the script these connections are created; it is a complete mystery. What can be done here so that the script runs indefinitely, as it is supposed to?
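For anyone debugging the same symptom, one way to see where the descriptors pile up is to have the process log its own usage on every pass of the loop. A minimal diagnostic sketch, assuming the third-party psutil package is installed (it is not used in the script above):

import logging
import psutil

def log_fd_usage():
    proc = psutil.Process()  # this script's own process
    close_wait = sum(1 for c in proc.connections(kind='inet')
                     if c.status == psutil.CONN_CLOSE_WAIT)
    # num_fds() is POSIX-only, which matches the Ubuntu server here
    logging.warning(f'open fds: {proc.num_fds()}, CLOSE_WAIT sockets: {close_wait}')

Calling log_fd_usage() at the top of each while-loop iteration shows whether both counts climb with every new driver instance.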

python selenium-webdriver tcp
1 Answer

I was creating a new driver instance in every iteration of the endless while loop. That was the culprit. When I created only one driver instance for the entire script, the problem stopped.
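A minimal sketch of that single-driver layout, reusing the imports and globals from the script above (illustrative only; the 60-second poll interval and the idea of passing the shared driver into process_pdf are assumptions, not part of the original script):

# Sketch: launch Chrome once, reuse it on every pass, quit it only on shutdown.
chrome_options = uc.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')  # remaining options trimmed for brevity
chrome_options.add_experimental_option('prefs', {
    "download.default_directory": download_directory,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True
})
driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
driver.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent': user_agent})
wait = WebDriverWait(driver, 20)

try:
    while True:
        driver.get(url)  # reuse the same browser instead of launching a new one
        wait.until(EC.presence_of_element_located(
            (By.XPATH, '//table[@id="CFanncEquityTable"]/tbody/tr[1]/td[6]')))
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # ... same row processing as in the script above, with process_pdf
        # changed to accept the shared driver instead of creating its own
        time.sleep(60)  # assumed poll interval between passes
finally:
    driver.quit()  # the only quit; descriptors are released once, on shutdown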
