我正在异步使用 playwright 进行网络抓取。一段时间后运行以下代码时,出现以下错误。我该如何解决这个问题?
<--- Last few GCs --->
[19972:000001D3F1152480] 8825924 ms: Scavenge (reduce) 2044.9 (2082.6) -> 2044.3 (2082.6) MB, 5.5 / 0.0 ms (average mu = 0.079, current mu = 0.060) allocation failure
[19972:000001D3F1152480] 8825937 ms: Scavenge (reduce) 2045.1 (2082.6) -> 2044.5 (2082.6) MB, 5.4/ 0.0 ms (average mu = 0.079, current mu = 0.060) allocation failure
[19972:000001D3F1152480] 8825954 ms: Scavenge (reduce) 2045.2 (2082.6) -> 2044.6 (2082.8) MB, 5.3 / 0.0 ms (average mu = 0.079, current mu = 0.060) allocation failure
<--- JS stacktrace --->
FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
1: 00007FF75C2D013F v8::internal::CodeObjectRegistry::~CodeObjectRegistry+112495
2: 00007FF75C25F396 DSA_meth_get_flags+65526
3: 00007FF75C26024D node::OnFatalError+301
4: 00007FF75CB919EE v8::Isolate::ReportExternalAllocationLimitReached+94
5: 00007FF75CB7BECD v8::SharedArrayBuffer::Externalize+781
6: 00007FF75CA1F61C v8::internal::Heap::EphemeronKeyWriteBarrierFromCode+1468
7: 00007FF75CA2C2C9 v8::internal::Heap::PublishPendingAllocations+1129
8: 00007FF75CA2929A v8::internal::Heap::PageFlagsAreConsistent+2842
9: 00007FF75CA1BF19 v8::internal::Heap::CollectGarbage+2137
10: 00007FF75CA1A0D0 v8::internal::Heap::AllocateExternalBackingStore+2000
11: 00007FF75CA37D40 v8::internal::FreeListManyCached::Reset+1408
12: 00007FF75CA383F5 v8::internal::Factory::AllocateRaw+37
13: 00007FF75CA4D83B v8::internal::FactoryBase<v8::internal::Factory>::NewRawOneByteString+75
14: 00007FF75CA46089 v8::internal::Factory::NewStringFromOneByte+121
15: 00007FF75CB8DFBE v8::String::NewFromOneByte+222
16: 00007FF75C17537D v8::internal::OrderedNameDictionary::OrderedNameDictionary+36637
17: 00007FF75C174EC1 v8::internal::OrderedNameDictionary::OrderedNameDictionary+35425
18: 00007FF75C174ADE v8::internal::OrderedNameDictionary::OrderedNameDictionary+34430
19: 00007FF75C27AFCB v8::internal::Malloced::operator delete+4651
20: 00007FF75CBB3ACF v8::internal::SetupIsolateDelegate::SetupHeap+53823
21: 000001D3F31F9B92
import asyncio
import itertools
from playwright.async_api import async_playwright, Playwright
from playwright.async_api import expect
import openpyxl as O
timeout_citations_overview = 20000
before_citations_timeout = 10000
Excel_worksheet_general = "General"
Excel_worksheet_authors = "Authors"
path_to_excel_file = ""
Excel_file = path_to_excel_file
wb = O.load_workbook(Excel_file)
ws_general = wb[Excel_worksheet_general]
ws_authors = wb[Excel_worksheet_authors]
row_num = ws_general.max_row
col_num = ws_general.max_column
liste = []
for row in ws_general.values:
liste.append(list(filter(None, row)))
import time
next_delay = 1
async def main():
async with async_playwright() as p:
# browser = await p.chromium.launch(headless=True, args=["--window-size=1920,1080", "--disable-gpu","--no-sandbox", "--ignore-certificate-error" ])
# context = await browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")
browser = await p.chromium.launch(headless=False)
context = await browser.new_context()
page = await context.new_page()
await page.goto("https://www.scopus.com/search/form.uri?")
laist = liste[1][1:]
length_of_laist = len(laist)
var = []
update_selection = []
initial_value = []
later_value = []
less_than = []
citations_before = []
year = liste[1][0]
async def x(i, j):
laist = liste[j][1:]
waittime = 4
if i >= 1:
await asyncio.sleep(waittime)
if i >= 2:
await asyncio.sleep(waittime)
if i >= 3:
await asyncio.sleep(waittime)
if i >= 4:
await asyncio.sleep(waittime)
if i >= 5:
await asyncio.sleep(waittime)
if i >= 6:
await asyncio.sleep(waittime)
if i >= 7:
await asyncio.sleep(waittime)
if i >= 8:
await asyncio.sleep(waittime)
if i >= 9:
await asyncio.sleep(waittime)
if i >= 10:
await asyncio.sleep(waittime)
# print(laist)
year = liste[j][0]
var.append(i)
update_selection.append(i)
initial_value.append(i)
later_value.append(i)
less_than.append(i)
citations_before.append(i)
var[i] = await context.new_page()
try:
try:
await var[i].goto(f"https://www.scopus.com/authid/detail.uri?authorId={laist[i]}&origin=recordpage")
open_citations_overview = await var[i].wait_for_selector(
"id=AuthorProfilePage_CitationOverviewLink")
await open_citations_overview.click()
initial_value[i][i] = await var[i].wait_for_selector("//div[@id='lessThan']//span")
await var[i].locator("#startYear-button span").first.click(timeout=timeout_citations_overview)
await var[i].locator(f"div[role=\"option\"]:has-text(\"{(int(year))}\")").click()
update_selection[i] = await var[i].wait_for_selector("id=updateOverviewButtonOn")
await update_selection[i].click()
locator = var[i].locator("//span[@class='fontSmall previousYearsHeader']")
await expect(locator).to_contain_text(f"<{year}", timeout=40000)
later_value[i] = await var[i].wait_for_selector("//div[@id='lessThan']//span")
while await initial_value[i].inner_text() == await later_value[i].inner_text():
await asyncio.sleep(0.5)
later_value[i] = await var[i].wait_for_selector("//div[@id='lessThan']//span")
try:
less_than[i] = await var[i].wait_for_selector("//div[@id='lessThan']//span",
timeout=before_citations_timeout)
except:
less_than[i] = await var[i].wait_for_selector("//div[@id='lessThan']")
citations_before[i] = await less_than[i].inner_text()
print(citations_before[i])
ws_authors.cell(j, i + 1).value = citations_before[i]
await var[i].close()
return i
except:
pass
except:
await var[i].close()
pass
for j in range(3, row_num):
wb.save(Excel_file)
wb.close()
var = []
update_selection = []
initial_value = []
later_value = []
less_than = []
citations_before = []
laist = liste[j][1:]
length_of_laist = len(laist)
for f in asyncio.as_completed([x(i, j=j) for i in range(0, length_of_laist)]):
result = await f
await browser.close()
asyncio.run(main())
from playwright.sync_api import Playwright, async_playwright
async def run(playwright: Playwright) -> None:
browser = await playwright.chromium.launch(
headless=False,
extra_browser_args=["--disable-dev-shm-usage", "--disable-gpu", "--disable-setuid-sandbox", "--no-sandbox", "--disable-infobars", "--disable-extensions", "--disable-web-security", "--disable-site-isolation-trials", "--shm-size=1gb"]
)