Python PLaywright Asyncio - 致命错误:堆限制附近的无效标记压缩分配失败 - JavaScript 堆内存不足

问题描述 投票:0回答:1

我正在异步使用 playwright 进行网络抓取。一段时间后运行以下代码时,出现以下错误。我该如何解决这个问题?

<--- Last few GCs --->

[19972:000001D3F1152480]  8825924 ms: Scavenge (reduce) 2044.9 (2082.6) -> 2044.3 (2082.6) MB, 5.5 / 0.0 ms  (average mu = 0.079, current mu = 0.060) allocation failure
 
[19972:000001D3F1152480]  8825937 ms: Scavenge (reduce) 2045.1 (2082.6) -> 2044.5 (2082.6) MB, 5.4/ 0.0 ms  (average mu = 0.079, current mu = 0.060) allocation failure

[19972:000001D3F1152480]  8825954 ms: Scavenge (reduce) 2045.2 (2082.6) -> 2044.6 (2082.8) MB, 5.3 / 0.0 ms  (average mu = 0.079, current mu = 0.060) allocation failure 


<--- JS stacktrace --->

FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory

 1: 00007FF75C2D013F v8::internal::CodeObjectRegistry::~CodeObjectRegistry+112495
 2: 00007FF75C25F396 DSA_meth_get_flags+65526
 3: 00007FF75C26024D node::OnFatalError+301
 4: 00007FF75CB919EE v8::Isolate::ReportExternalAllocationLimitReached+94
 5: 00007FF75CB7BECD v8::SharedArrayBuffer::Externalize+781
 6: 00007FF75CA1F61C v8::internal::Heap::EphemeronKeyWriteBarrierFromCode+1468
 7: 00007FF75CA2C2C9 v8::internal::Heap::PublishPendingAllocations+1129
 8: 00007FF75CA2929A v8::internal::Heap::PageFlagsAreConsistent+2842
 9: 00007FF75CA1BF19 v8::internal::Heap::CollectGarbage+2137
10: 00007FF75CA1A0D0 v8::internal::Heap::AllocateExternalBackingStore+2000
11: 00007FF75CA37D40 v8::internal::FreeListManyCached::Reset+1408
12: 00007FF75CA383F5 v8::internal::Factory::AllocateRaw+37
13: 00007FF75CA4D83B v8::internal::FactoryBase<v8::internal::Factory>::NewRawOneByteString+75
14: 00007FF75CA46089 v8::internal::Factory::NewStringFromOneByte+121
15: 00007FF75CB8DFBE v8::String::NewFromOneByte+222
16: 00007FF75C17537D v8::internal::OrderedNameDictionary::OrderedNameDictionary+36637
17: 00007FF75C174EC1 v8::internal::OrderedNameDictionary::OrderedNameDictionary+35425
18: 00007FF75C174ADE v8::internal::OrderedNameDictionary::OrderedNameDictionary+34430
19: 00007FF75C27AFCB v8::internal::Malloced::operator delete+4651
20: 00007FF75CBB3ACF v8::internal::SetupIsolateDelegate::SetupHeap+53823
21: 000001D3F31F9B92 
import asyncio

import itertools
from playwright.async_api import async_playwright, Playwright
from playwright.async_api import expect
import openpyxl as O

timeout_citations_overview = 20000
before_citations_timeout = 10000
Excel_worksheet_general = "General"
Excel_worksheet_authors = "Authors"
path_to_excel_file = ""

Excel_file = path_to_excel_file
wb = O.load_workbook(Excel_file)
ws_general = wb[Excel_worksheet_general]
ws_authors = wb[Excel_worksheet_authors]

row_num = ws_general.max_row
col_num = ws_general.max_column

liste = []

for row in ws_general.values:
    liste.append(list(filter(None, row)))

import time

next_delay = 1


async def main():
    async with async_playwright() as p:
        # browser = await p.chromium.launch(headless=True, args=["--window-size=1920,1080", "--disable-gpu","--no-sandbox", "--ignore-certificate-error" ])
        # context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("https://www.scopus.com/search/form.uri?")

        laist = liste[1][1:]

        length_of_laist = len(laist)
        var = []
        update_selection = []
        initial_value = []
        later_value = []
        less_than = []
        citations_before = []
        year = liste[1][0]

        async def x(i, j):
            laist = liste[j][1:]

            waittime = 4
            if i >= 1:
                await asyncio.sleep(waittime)
            if i >= 2:
                await asyncio.sleep(waittime)
            if i >= 3:
                await asyncio.sleep(waittime)
            if i >= 4:
                await asyncio.sleep(waittime)
            if i >= 5:
                await asyncio.sleep(waittime)
            if i >= 6:
                await asyncio.sleep(waittime)
            if i >= 7:
                await asyncio.sleep(waittime)
            if i >= 8:
                await asyncio.sleep(waittime)
            if i >= 9:
                await asyncio.sleep(waittime)
            if i >= 10:
                await asyncio.sleep(waittime)

            # print(laist)

            year = liste[j][0]

            var.append(i)
            update_selection.append(i)
            initial_value.append(i)
            later_value.append(i)
            less_than.append(i)
            citations_before.append(i)

            var[i] = await context.new_page()

            try:
                try:
                    await var[i].goto(f"https://www.scopus.com/authid/detail.uri?authorId={laist[i]}&origin=recordpage")
                    open_citations_overview = await var[i].wait_for_selector(
                        "id=AuthorProfilePage_CitationOverviewLink")
                    await open_citations_overview.click()
                    initial_value[i][i] = await var[i].wait_for_selector("//div[@id='lessThan']//span")

                    await var[i].locator("#startYear-button span").first.click(timeout=timeout_citations_overview)
                    await var[i].locator(f"div[role=\"option\"]:has-text(\"{(int(year))}\")").click()
                    update_selection[i] = await var[i].wait_for_selector("id=updateOverviewButtonOn")
                    await update_selection[i].click()

                    locator = var[i].locator("//span[@class='fontSmall previousYearsHeader']")
                    await expect(locator).to_contain_text(f"<{year}", timeout=40000)
                    later_value[i] = await var[i].wait_for_selector("//div[@id='lessThan']//span")

                    while await initial_value[i].inner_text() == await later_value[i].inner_text():
                        await asyncio.sleep(0.5)
                        later_value[i] = await var[i].wait_for_selector("//div[@id='lessThan']//span")
                        
                    try:
                        less_than[i] = await var[i].wait_for_selector("//div[@id='lessThan']//span",
                                                                      timeout=before_citations_timeout)
                    except:
                        less_than[i] = await var[i].wait_for_selector("//div[@id='lessThan']")

                    citations_before[i] = await less_than[i].inner_text()
                    print(citations_before[i])
                    ws_authors.cell(j, i + 1).value = citations_before[i]
                    await var[i].close()
                    return i
               except:
                
                pass 


       

            except:
                await var[i].close()
                pass

        for j in range(3, row_num):
            wb.save(Excel_file)
            wb.close()
            var = []
            update_selection = []
            initial_value = []
            later_value = []
            less_than = []
            citations_before = []
            laist = liste[j][1:]

            length_of_laist = len(laist)

            for f in asyncio.as_completed([x(i, j=j) for i in range(0, length_of_laist)]):
                result = await f

        await browser.close()


asyncio.run(main())
python web-scraping fatal-error playwright
1个回答
0
投票
from playwright.sync_api import Playwright, async_playwright


async def run(playwright: Playwright) -> None:
    browser = await playwright.chromium.launch(
        headless=False,
        extra_browser_args=["--disable-dev-shm-usage", "--disable-gpu", "--disable-setuid-sandbox", "--no-sandbox", "--disable-infobars", "--disable-extensions", "--disable-web-security", "--disable-site-isolation-trials", "--shm-size=1gb"]
    )
© www.soinside.com 2019 - 2024. All rights reserved.