我正在尝试使用selenium抓取网站。但是它非常慢。每条记录需要一分钟。
网页是https://jamabandi.nic.in/land%20records/NakalRecord。我正在努力刷新每一条记录。
那么有什么替代方案吗?我可以改用某个 API 端点,或者直接发 HTTP 请求来获取数据吗?
我的代码是
# Nested Selenium scraping loop over cascading dropdowns:
# district -> tehsil -> village -> year -> owner type -> owner name.
# Helpers (district_func, tehsil_func, village_func, year_func, owner_drop,
# owner_names_func, dataframe_check, select_all,
# handle_stale_element_reference) are defined elsewhere in the project.
di = 11  # starting district index (resume point)
district_xpath_new = (By.XPATH, district_xpath)
dropdown_district = Select(handle_stale_element_reference(driver, district_xpath_new))
dropdown_district.select_by_index(di)
total_districts = len(Select(handle_stale_element_reference(driver, district_xpath_new)).options)
while di < total_districts:
    time.sleep(5)  # let the page settle after the previous postback
    driver, district_name = district_func(driver, di, district_xpath)
    print("District Started " + str(di))
    te = 1
    driver, dropdown_tehsil, total_tehsils, tehsil_name = tehsil_func(driver, te, tehsil_xpath)
    while te < total_tehsils:
        time.sleep(5)
        print("Tehsil Started is" + str(te))
        driver, dropdown_tehsil, total_tehsils, tehsil_name = tehsil_func(driver, te, tehsil_xpath)
        vi = 8  # starting village index (resume point)
        driver, dropdown_village, total_vill, village_name = village_func(driver, vi, vill_xpath)
        while vi < total_vill:
            time.sleep(5)
            print("Village Started is" + str(vi))
            driver, dropdown_village, total_vill, village_name = village_func(driver, vi, vill_xpath)
            ye = 3  # starting year index (resume point)
            driver, dropdown_year, total_year, year = year_func(driver, ye, year_xpath)
            while ye < total_year:
                time.sleep(5)
                print("Year Started is" + str(ye))
                driver, dropdown_year, total_year, year = year_func(driver, ye, year_xpath)
                ow = 2  # owner-type dropdown index ("Personal Ownerlist")
                time.sleep(10)
                print("Selected Personal Ownerlist" + str(ow))
                driver, dropdown_owner = owner_drop(driver, ow, owner_dropdown_xpath)
                name = 280  # starting owner-name index (resume point)
                driver, owner_name_drop, total_names, name_of_owner = owner_names_func(driver, name, owner_name_xpath)
                while name < total_names:
                    print("Names Started is" + str(name))
                    time.sleep(2)
                    driver, owner_name_drop, total_names, name_of_owner = owner_names_func(driver, name, owner_name_xpath)
                    try:
                        # Names containing '?' are unreadable/mojibake; skip them.
                        if '?' not in name_of_owner:
                            print(name_of_owner)
                            df_owner, driver = dataframe_check(driver, district_name, tehsil_name, village_name, year, name)
                            driver = select_all(driver, di, ye, te, vi, ow, name)
                    except Exception:  # was a bare except; keep best-effort but don't trap SystemExit/KeyboardInterrupt
                        print("Name is" + str(name))
                        print("Not Found")
                    name += 1
                ye += 1  # counter increment was missing in the pasted snippet -> infinite loop
            vi += 1  # counter increment was missing in the pasted snippet -> infinite loop
        te += 1  # counter increment was missing in the pasted snippet -> infinite loop
    di += 1  # counter increment was missing in the pasted snippet -> infinite loop
Selenium 有很多开销(打开浏览器、渲染 HTML/CSS、javascript、再次浏览器等)。 大多数时候使用 Selenium 完成的工作可以减少为几个 HTTP 请求,这可以直接从 Python 完成,并且开销更少。
本质上就是这样。在网络抓取中,浏览器的开发人员工具是您的朋友。通过稍微查看“网络”选项卡,可以发现网页在每个选择上向网络服务器发出 POST 请求,以获取下一个类别的选项。
它还会向服务器发送一些状态变量。您基本上必须用 requests 模块来模拟这一过程。
这是一个工作示例,您可以根据需要进行编辑:
import requests
from bs4 import BeautifulSoup as BS
# Target page: the Jamabandi land-records "Nakal" search form (ASP.NET).
URL = "https://jamabandi.nic.in/land%20records/NakalRecord"
def get_aspnet_form(soup: BS) -> dict:
    """Collect the hidden ASP.NET state fields from the page and build the
    base POST payload.

    Reads every ``<input>`` that is a direct child of the page's
    ``<form id="aspnetForm">`` (the __VIEWSTATE / __EVENTVALIDATION state
    inputs sit immediately under the form element) and merges them with
    the static fields the server expects on every postback.

    Returns a dict suitable as the ``data=`` argument of ``requests.post``.
    """
    form = {
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__SCROLLPOSITIONX": "0",
        "__SCROLLPOSITIONY": "0",
        # Radio button: search records "by owner".
        "ctl00$ContentPlaceHolder1$a": "RdobtnOwner",
    }
    aspnet_form = soup.find("form", attrs={"id": "aspnetForm"})
    for field in aspnet_form.find_all("input", recursive=False):
        # .get() guards against inputs that carry no value attribute
        # (the original attrs["value"] raised KeyError on those).
        form[field.attrs["name"]] = field.attrs.get("value", "")
    return form
def get_options(soup: BS, type_: str) -> list:
    """Return ``[options, field_name]`` for the dropdown named by *type_*.

    *options* is a list of ``(value, label)`` pairs with the ``"-1"``
    placeholder entry filtered out; *field_name* is the select's form-field
    name, which the caller sends back as ``__EVENTTARGET`` on the next
    postback.  Raises ``KeyError`` for an unknown *type_*.
    """
    suffix_by_type = {
        "district": "ddldname",
        "tehsil": "ddltname",
        "village": "ddlvname",
        "period": "ddlPeriod",
        "owner": "ddlOwner",
        "record": "ListBox1",
    }
    select = soup.find(
        "select",
        attrs={"id": "ctl00_ContentPlaceHolder1_%s" % suffix_by_type[type_]},
    )
    options = [
        (opt.attrs["value"], opt.text)
        for opt in select.find_all("option")
        if opt.attrs["value"] != "-1"
    ]
    return [options, select.attrs["name"]]
def get_records(soup: BS):
    """Extract owner-record names from the results list box.

    Entries whose value contains a ``'?'`` (unreadable / mis-encoded
    names) are skipped, mirroring the filter in the Selenium version.
    """
    listbox = soup.find(
        "select", attrs={"id": "ctl00_ContentPlaceHolder1_ListBox1"}
    )
    return [
        opt.attrs["value"]
        for opt in listbox.find_all("option")
        if "?" not in opt.attrs["value"]
    ]
if __name__ == "__main__":
    # Walk the cascading dropdowns district -> tehsil -> village -> period,
    # re-posting the ASP.NET form at each level (each selection triggers a
    # server postback that populates the next dropdown), then list the
    # owner records for each leaf combination.
    r = requests.get(URL)
    soup = BS(r.content, "html.parser")
    form = get_aspnet_form(soup)
    districts, district_event_target = get_options(soup, "district")
    for district_id, district_name in districts:
        print("Scraping from district %s:" % district_name)
        form["__EVENTTARGET"] = district_event_target
        form[district_event_target] = district_id
        soup = BS(requests.post(URL, data=form).content, "html.parser")
        # State fields change on every response; rebuild the form each time.
        form = get_aspnet_form(soup)
        tehsils, tehsil_event_target = get_options(soup, "tehsil")
        for tehsil_id, tehsil_name in tehsils:
            print("Scraping from tehsil %s:" % tehsil_name)
            form["__EVENTTARGET"] = tehsil_event_target
            form[district_event_target] = district_id
            form[tehsil_event_target] = tehsil_id
            soup = BS(requests.post(URL, data=form).content, "html.parser")
            form = get_aspnet_form(soup)
            villages, village_event_target = get_options(soup, "village")
            for village_id, village_name in villages:
                print("Scraping from village %s:" % village_name)
                form["__EVENTTARGET"] = village_event_target
                form[district_event_target] = district_id
                form[tehsil_event_target] = tehsil_id
                form[village_event_target] = village_id
                soup = BS(requests.post(URL, data=form).content, "html.parser")
                form = get_aspnet_form(soup)
                periods, period_event_target = get_options(soup, "period")
                for period_id, period_name in periods:
                    print("Scraping from period %s:" % period_name)
                    form["__EVENTTARGET"] = period_event_target
                    form[district_event_target] = district_id
                    form[tehsil_event_target] = tehsil_id
                    # NOTE(review): the village selection is not re-sent at
                    # this level or below — presumably the server keeps it in
                    # __VIEWSTATE; confirm if deeper levels return records
                    # for the wrong village.
                    form[period_event_target] = period_id
                    soup = BS(requests.post(URL, data=form).content, "html.parser")
                    form = get_aspnet_form(soup)
                    owners, owner_event_target = get_options(soup, "owner")
                    form["__EVENTTARGET"] = owner_event_target
                    form[district_event_target] = district_id
                    form[tehsil_event_target] = tehsil_id
                    form[period_event_target] = period_id
                    # "1" picks one owner category — presumably the private
                    # owner list; verify against the page's ddlOwner options.
                    form[owner_event_target] = "1"
                    soup = BS(requests.post(URL, data=form).content, "html.parser")
                    records = get_records(soup)
                    print(records)
我编写了 3 个实用函数来帮助减少意大利面条式代码。代码主循环的流程如下:

1. 用 GET 请求获取初始网页。
2. 用 BeautifulSoup 解析页面,获取所有 district 选项。
3. 对每个 district:更新 POST 请求要发送的表单状态变量,发送 POST 请求以获取该地区对应的下一级网页。
4. 对 tehsil、village、period 逐级重复同样的"解析选项 → 更新表单 → POST"过程。
5. 最后选择所有者类别,并打印该组合下的所有者记录。

注意:对于依赖项,请执行 pip install requests bs4。