我想向下滚动,比如说至少直到 TikTok 用户页面(例如 thebeatles 页面)上有 72 个帖子可见,然后保存 HTML 内容。该页面目前有 73 个帖子,但实际只返回了 30 个帖子。我的目标是:只要可见帖子的数量低于 72,就通过使用
page.keyboard.press("End")
按下“End”键来实现向下滚动。抓取过程中会中止图片和字体请求以进行优化。下面是代码 test_fetch_raw_source.py
和helper.py
,其中后者只包含优化功能。 如何向下滚动到第 n 个,在本例中为第 72 个可见帖子?
# test_fetch_raw_source.py
def test_fetch_front_page_with_n_latest_posts():
    """Load a TikTok profile, scroll until at least ``n`` posts exist, save the HTML.

    The "End" key is pressed repeatedly, and the post links are re-counted
    before every press, until the target count is reached or the press
    budget (``n`` presses) is exhausted. The page source is then written to
    ``html/<usr>/<usr>.html`` under the current working directory.
    """
    # Fetch front page with n_count of visible posts
    usr = r"thebeatles"
    url = "".join([r"https://www.tiktok.com/@", usr])
    n = 72  # The most current posts count
    post_xpath = "//div[@data-e2e='user-post-item']//a"
    load_wait_ms = 1000  # pause after each "End" press so new posts can load

    def get_visible_post_count(page):
        """Return how many post links currently match ``post_xpath``."""
        return page.locator(post_xpath).count()

    def press_end(url_n):
        """Press "End" and pause for loading, unless ``url_n`` is truthy."""
        if not url_n:
            page.keyboard.press("End")
            page.wait_for_timeout(load_wait_ms)

    with sync_playwright() as playwright:
        # NOTE(review): fetching is reported to work only in headed mode
        # (headless=False) -- confirm before relying on a headless run.
        browser = playwright.chromium.launch()
        try:
            context = browser.new_context()
            page = context.new_page()
            helper.logging_network_events(page)  # Logging

            # Abort requests
            helper.abort_image(page)
            helper.abort_image_by_ext(page)
            helper.abort_font_by_ext(page)

            # Get page
            page.goto(url, timeout=0)

            # Keep pressing "End" until the nth post exists. The check is
            # re-evaluated on every iteration; evaluating it once before the
            # loop would never observe the posts loaded by scrolling.
            for _ in range(n):
                url_n = get_visible_post_count(page) >= n
                if url_n:
                    break
                press_end(url_n)

            # Post count
            post_count = get_visible_post_count(page)
            print(post_count)

            # Save html
            usr_rel_path = "".join(["html/", usr, "/"])
            os.makedirs(usr_rel_path, exist_ok=True)
            usr_folder = "".join([os.getcwd(), "/", usr_rel_path])
            file_name = "".join([usr, ".html"])
            file_dir = "".join([usr_folder, file_name])
            html_source = page.content()
            # Explicit UTF-8: the page source may contain non-ASCII text that
            # the platform default encoding cannot represent.
            with open(file_dir, "w", encoding="utf-8") as file:
                file.write(html_source)
        finally:
            # Close the browser even if navigation or saving fails.
            browser.close()
# helper.py
import re
def logging_network_events(page):
    """Print every request and response seen by ``page``.

    Requests are printed as ``>> METHOD URL RESOURCE_TYPE`` and responses
    as ``<< STATUS URL``.
    """

    def _log_request(request):
        print(">>", request.method, request.url, request.resource_type)

    def _log_response(response):
        print("<<", response.status, response.url)

    page.on("request", _log_request)
    page.on("response", _log_response)
def abort_image(page):
    """Route every request on ``page``: abort images, let everything else continue."""

    def _handle(route):
        # Decide by the resource type of the intercepted request.
        if route.request.resource_type == "image":
            return route.abort()
        return route.continue_()

    page.route("**/*", _handle)
def abort_image_by_ext(page):
    """Abort requests whose URL ends in a common image file extension.

    The pattern requires a literal dot before the extension and either the
    end of the URL or a query/fragment delimiter after it. A bare
    ``jpeg|jpg|png|tiff|gif`` would also match unrelated URLs that merely
    contain those letters (e.g. ``/api/gift/list``) and abort them.
    Matching is case-insensitive so ``.PNG`` is covered as well.
    """
    image_ext = re.compile(r"\.(?:jpeg|jpg|png|tiff|gif)(?=[?#]|$)", re.IGNORECASE)
    page.route(image_ext, lambda route: route.abort())
def abort_font_by_ext(page):
    """Abort requests whose URL ends in a common web-font file extension.

    The pattern requires a literal dot before the extension and either the
    end of the URL or a query/fragment delimiter after it. A bare
    ``woff2|woff|otf`` would also match unrelated URLs that merely contain
    those letters (e.g. ``/hotfix.js``) and abort them. Matching is
    case-insensitive.
    """
    font_ext = re.compile(r"\.(?:woff2|woff|otf)(?=[?#]|$)", re.IGNORECASE)
    page.route(font_ext, lambda route: route.abort())
在 JS 中:
首先为'post'元素定义定位器并选择第n个元素
滚动到这个元素
验证它在视图中可见。
// Locate the post elements and pick the target one. 'postLocator' is a
// placeholder for the real post selector. nth() is zero-based, so index 71
// is the 72nd post; an index beyond the number of posts on the page would
// never resolve.
const post = page.locator('postLocator').nth(71);
// Scroll the target post into view if it is not already there.
await post.scrollIntoViewIfNeeded();
// Make sure at least some part of element intersects viewport.
await expect(post).toBeInViewport();
参考:
https://playwright.dev/docs/api/class-locatorassertions#locator-assertions-to-be-in-viewport
该问题已通过以下方式解决:将
page.wait_for_timeout(1000)
添加到函数 press_end()
,并将 headless=False
添加到 playwright.chromium.launch()
:
def press_end(wait_ms=1000):
    """Press the "End" key on ``page``, then pause so further posts can load.

    Args:
        wait_ms: Pause after the key press, in milliseconds. Defaults to
            1000, the value previously hard-coded here.
    """
    page.keyboard.press("End")
    page.wait_for_timeout(wait_ms)
# Launch Chromium with a visible window; fetching is reported to work only
# with headless=False at present.
browser = playwright.chromium.launch(headless=False)
同时添加一项检查:判断第 n 个帖子是否可见,如果不可见,则继续按“End”键,如下所示:
# Keep pressing "End" until at least n post links are present.
n = 72
post_links = page.locator("//div[@data-e2e='user-post-item']//a")
ini_post_count = get_visible_post_count(page)
current_post_count = ini_post_count
# The number of presses is estimated from the initial batch size; max(..., 1)
# keeps range() from raising ValueError (zero step) when no post has loaded yet.
for _ in range(ini_post_count, n, max(ini_post_count, 1)):
    press_end()
    if current_post_count:
        # Wait on the last post counted so far. The count is tracked in
        # current_post_count so that no name is read before it is assigned.
        post_links.nth(current_post_count - 1).wait_for()
    current_post_count = get_visible_post_count(page)
    if current_post_count >= n:
        break
由于某些与服务器端安全有关的原因,目前的抓取仅适用于
headless=False
模式;此外,添加 page.wait_for_timeout(1000)
以在到达页面底部后等待帖子加载是至关重要的。