Slow scraping with Selenium

Question

I am trying to scrape a website using Selenium, but it is very slow: each record takes about a minute.

The page is https://jamabandi.nic.in/land%20records/NakalRecord, and I am trying to work through every record on it.

Is there an alternative? Can I use an API endpoint or plain HTTP requests instead?

My code is:

# Imports used by this snippet; driver, the *_xpath variables and the
# helpers handle_stale_element_reference, district_func, tehsil_func,
# village_func, year_func, owner_drop, owner_names_func, dataframe_check
# and select_all are defined elsewhere in my script.
import time

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

di = 11
district_xpath_new = (By.XPATH, district_xpath)
dropdown_district = Select(handle_stale_element_reference(driver, district_xpath_new))
dropdown_district.select_by_index(di)
total_districts = len(Select(handle_stale_element_reference(driver, district_xpath_new)).options)
while di < total_districts:
    time.sleep(5)
    driver, district_name = district_func(driver, di, district_xpath)
    print("District started: " + str(di))
    te = 1
    driver, dropdown_tehsil, total_tehsils, tehsil_name = tehsil_func(driver, te, tehsil_xpath)
    while te < total_tehsils:
        time.sleep(5)
        print("Tehsil started: " + str(te))
        driver, dropdown_tehsil, total_tehsils, tehsil_name = tehsil_func(driver, te, tehsil_xpath)
        vi = 8
        driver, dropdown_village, total_vill, village_name = village_func(driver, vi, vill_xpath)
        while vi < total_vill:
            time.sleep(5)
            print("Village started: " + str(vi))
            driver, dropdown_village, total_vill, village_name = village_func(driver, vi, vill_xpath)
            ye = 3
            driver, dropdown_year, total_year, year = year_func(driver, ye, year_xpath)
            while ye < total_year:
                time.sleep(5)
                print("Year started: " + str(ye))
                driver, dropdown_year, total_year, year = year_func(driver, ye, year_xpath)
                ow = 2
                time.sleep(10)
                print("Selected personal owner list: " + str(ow))
                driver, dropdown_owner = owner_drop(driver, ow, owner_dropdown_xpath)
                name = 280
                driver, owner_name_drop, total_names, name_of_owner = owner_names_func(driver, name, owner_name_xpath)
                while name < total_names:
                    print("Name started: " + str(name))
                    time.sleep(2)
                    driver, owner_name_drop, total_names, name_of_owner = owner_names_func(driver, name, owner_name_xpath)
                    try:
                        if '?' not in name_of_owner:
                            print(name_of_owner)
                            df_owner, driver = dataframe_check(driver, district_name, tehsil_name, village_name, year, name)
                            driver = select_all(driver, di, ye, te, vi, ow, name)
                    except Exception:
                        print("Name is " + str(name))
                        print("Not found")
                    name += 1  # advance to the next owner name
                ye += 1  # advance to the next year
            vi += 1  # advance to the next village
        te += 1  # advance to the next tehsil
    di += 1  # advance to the next district
python selenium-webdriver web-scraping
1 Answer

Selenium has a lot of overhead (launching a browser, rendering HTML/CSS, executing JavaScript, and so on). Most of the time, work done with Selenium can be reduced to a handful of HTTP requests, which can be made directly from Python with far less overhead.

That is essentially the case here. In web scraping, the browser's developer tools are your friend. A little poking around in the Network tab reveals that on every selection the page sends a POST request to the web server to fetch the options for the next category.

It also sends a few state variables back to the server. You basically have to replicate all of this with the requests module.
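
As a quick way to see those state variables for yourself, here is a minimal sketch that lists the page's hidden inputs; it assumes the form id aspnetForm, the same one the full example below relies on:

import requests
from bs4 import BeautifulSoup

URL = "https://jamabandi.nic.in/land%20records/NakalRecord"

# Fetch the page once and print the names of the hidden state fields
# (typically __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION).
soup = BeautifulSoup(requests.get(URL).content, "html.parser")
form = soup.find("form", attrs={"id": "aspnetForm"})
for hidden in form.find_all("input", attrs={"type": "hidden"}):
    print(hidden.attrs.get("name"))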

Here is a working example that you can edit to suit your needs:

import requests
from bs4 import BeautifulSoup as BS


URL = "https://jamabandi.nic.in/land%20records/NakalRecord"


def get_aspnet_form(soup: BS):
    # Base postback values, plus every hidden input (__VIEWSTATE,
    # __EVENTVALIDATION, ...) that ASP.NET expects back on each request.
    form = {
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__SCROLLPOSITIONX": "0",
        "__SCROLLPOSITIONY": "0",
        "ctl00$ContentPlaceHolder1$a": "RdobtnOwner",
    }

    aspnet_form = soup.find("form", attrs={"id": "aspnetForm"})
    for i in aspnet_form.find_all("input", recursive=False):
        # Some inputs have no value attribute; default those to "".
        form[i.attrs["name"]] = i.attrs.get("value", "")

    return form


def get_options(soup: BS, type_: str) -> list:
    # Map a human-readable category to the id suffix of its <select>.
    types = {
        "district": "ddldname",
        "tehsil": "ddltname",
        "village": "ddlvname",
        "period": "ddlPeriod",
        "owner": "ddlOwner",
        "record": "ListBox1",
    }
    ID = "ctl00_ContentPlaceHolder1_%s" % types[type_]
    select = soup.find("select", attrs={"id": ID})

    # Collect every real option; value "-1" is the "select..." placeholder.
    result = []
    for option in select.find_all("option"):
        value = option.attrs["value"]
        text = option.text
        if value != "-1":
            result.append((value, text))

    # Also return the select's name, which serves as the __EVENTTARGET.
    return [result, select.attrs["name"]]


def get_records(soup: BS):
    # The final list box holds the owner records for the selection.
    ID = "ctl00_ContentPlaceHolder1_ListBox1"
    records = soup.find("select", attrs={"id": ID})

    result = []
    for record in records.find_all("option"):
        name = record.attrs["value"]
        if "?" not in name:  # skip entries with unreadable characters
            result.append(name)

    return result


if __name__ == "__main__":
    # Step 1: GET the normal web page and parse it.
    r = requests.get(URL)
    soup = BS(r.content, "html.parser")

    # Steps 2-3: read the form state and the available district options.
    form = get_aspnet_form(soup)
    districts, district_event_target = get_options(soup, "district")
    for district_id, district_name in districts:
        print("Scraping from district %s:" % district_name)
        form["__EVENTTARGET"] = district_event_target
        form[district_event_target] = district_id

        # Step 4: POST the selection to get the page for this district.
        soup = BS(requests.post(URL, data=form).content, "html.parser")

        # Step 5: repeat the same postback dance for tehsils.
        form = get_aspnet_form(soup)
        tehsils, tehsil_event_target = get_options(soup, "tehsil")
        for tehsil_id, tehsil_name in tehsils:
            print("Scraping from tehsil %s:" % tehsil_name)
            form["__EVENTTARGET"] = tehsil_event_target
            form[district_event_target] = district_id
            form[tehsil_event_target] = tehsil_id

            soup = BS(requests.post(URL, data=form).content, "html.parser")

            # Step 6: iterate the tehsils to get the villages.
            form = get_aspnet_form(soup)
            villages, village_event_target = get_options(soup, "village")
            for village_id, village_name in villages:
                print("Scraping from village %s:" % village_name)
                form["__EVENTTARGET"] = village_event_target
                form[district_event_target] = district_id
                form[tehsil_event_target] = tehsil_id
                form[village_event_target] = village_id

                soup = BS(requests.post(URL, data=form).content, "html.parser")

                # Step 7: iterate the villages to get the periods.
                form = get_aspnet_form(soup)
                periods, period_event_target = get_options(soup, "period")
                for period_id, period_name in periods:
                    print("Scraping from period %s:" % period_name)
                    form["__EVENTTARGET"] = period_event_target
                    form[district_event_target] = district_id
                    form[tehsil_event_target] = tehsil_id
                    # Also re-send the village selection so the posted
                    # form stays consistent with the earlier steps.
                    form[village_event_target] = village_id
                    form[period_event_target] = period_id

                    soup = BS(requests.post(URL, data=form).content, "html.parser")

                    # Step 8: select the owner type; "1" is the Niji
                    # (private) owner list.
                    form = get_aspnet_form(soup)
                    owners, owner_event_target = get_options(soup, "owner")
                    form["__EVENTTARGET"] = owner_event_target
                    form[district_event_target] = district_id
                    form[tehsil_event_target] = tehsil_id
                    form[village_event_target] = village_id
                    form[period_event_target] = period_id
                    form[owner_event_target] = "1"

                    soup = BS(requests.post(URL, data=form).content, "html.parser")

                    # Finally, read the owner records from the list box.
                    records = get_records(soup)
                    print(records)

I made three utility functions to help cut down on spaghetti code. This is the main loop of the code:

  1. Fetch the normal web page with a GET request.
  2. Using an HTML tree parser such as BeautifulSoup, get all the district options.
  3. Now get the form variables to be sent with the POST request.
  4. Now iterate over each district and send a POST request to get the web page for that district.
  5. Now repeat the steps above on each iteration, but for tehsils.
  6. Iterate over the tehsils to get the villages.
  7. Iterate over the villages to get the periods.
  8. Finally, for each period, take only the Niji owner records.
  9. Rinse and repeat until done.
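
If you want to keep the results rather than just print them, here is a minimal sketch of one way to persist them; the save_records helper and its column layout are hypothetical, not part of the site or the example above:

import csv

def save_records(records, district_name, tehsil_name, village_name, period_name):
    # Append one row per record so partial progress survives a crash.
    with open("records.csv", "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        for record in records:
            writer.writerow([district_name, tehsil_name,
                             village_name, period_name, record])

You would call it right after get_records(soup) in the innermost loop.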

Note: for the dependencies, run

pip install requests bs4
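
One further refinement worth considering, assuming the server ties its state to session cookies (I have not verified that for this site): route everything through a single requests.Session, which keeps cookies and the underlying connection alive across postbacks:

import requests

URL = "https://jamabandi.nic.in/land%20records/NakalRecord"

# One Session reuses cookies and the TCP connection for every request,
# which is usually faster and closer to what a real browser does.
session = requests.Session()
response = session.get(URL)
# ...then replace each requests.post(URL, data=form) in the example
# above with session.post(URL, data=form).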
