我创建了一个代码块,它可以在网络上抓取波兰网站的房产列表信息。
import bs4
import csv
from urllib.request import urlopen as Open
from urllib.request import Request
from bs4 import BeautifulSoup as soup
# Browser-like User-Agent sent with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

# Search-results URL; the 1-based page number is appended to it.
SEARCH_URL = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page="

# Property features looked up on each offer page, in CSV column order.
FEATURE_NAMES = (
    "Forma własności",
    "Liczba pięter",
    "Liczba pokoi",
    "Materiał budynku",
    "Ogrzewanie",
    "Okna",
    "Piętro",
    "Powierzchnia",
    "Rodzaj zabudowy",
    "Rok budowy",
    "Rynek",
    "Stan wykończenia",
)

# The header is derived from FEATURE_NAMES so it always has exactly one column
# per extracted value (the hand-written header listed two features twice).
CSV_HEADER = ["Price", "Location", *FEATURE_NAMES, "Link"]


def fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup's html.parser."""
    req = Request(url=url, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with Open(req) as response:
        html = response.read()
    return soup(html, "html.parser")


def extract_features(item_texts, feature_names=FEATURE_NAMES):
    """Return one value per feature name, or "N/A" when a feature is absent.

    *item_texts* is an iterable of strings shaped like "<name>:<value>".
    For each feature, the first text containing the feature name is used:
    the part after the first colon, with the " m²" unit removed.
    """
    item_texts = list(item_texts)
    features = []
    for feature in feature_names:
        for text in item_texts:
            if feature in text:
                features.append(text.split(":")[1].replace(" m²", ""))
                break
        else:  # no item mentioned this feature
            features.append("N/A")
    return features


def scrape_offer(url):
    """Fetch one offer page and return its CSV row (price, location, features, link)."""
    page_soup = fetch_soup(url)
    # get location
    location = page_soup.find("a", {"href": "#map"}).text.split("}")[2]
    # get price
    price_text = page_soup.find("div", {"class": "css-1vr19r7"}).text
    price = price_text.replace(" ", "").replace("zł", "")
    # get property features
    overview = page_soup.find("section", {"class": "section-overview"})
    container = overview.findNext("div").ul.findAll("li")
    features = extract_features([item.text for item in container])
    return [price, location, *features, url]


def main(max_pages=1):
    """Scrape up to *max_pages* result pages and save the offers to filename.csv.

    The number of pages actually visited is capped by the page count read
    from the pager on the first results page.
    """
    first_page = fetch_soup(SEARCH_URL + "1")
    pager = first_page.find("div", {"class": "after-offers clearfix"}).find("ul", {"class": "pager"})
    total_pages = int(pager.findAll("li")[4].text)

    offer_list = [CSV_HEADER]
    for page in range(1, min(total_pages, max_pages) + 1):
        print(page)
        page_soup = fetch_soup(SEARCH_URL + str(page))
        listings = page_soup.findAll("article", {"data-featured-name": "listing_no_promo"})
        for article in listings:
            offer_list.append(scrape_offer(article.a.get("href")))

    # utf-8-sig writes a BOM so spreadsheet programs detect UTF-8 and show the
    # Polish characters correctly; newline="" is required by the csv module to
    # avoid blank rows on Windows.
    with open("filename.csv", "w", encoding="utf-8-sig", newline="") as file:
        csv.writer(file).writerows(offer_list)
    print("data saved")


if __name__ == "__main__":
    main()
我已经到了保存文件这一步,但是保存后波兰语字符出现了乱码,例如
Åódź, łódzkie
有没有办法把波兰语字符转换成纯拉丁字母(例如把 ó 转换为 o),或者让它们保持原样并正确显示?
说来有趣,我在编写解析 otodom 数据的脚本时遇到了同样的问题,于是发现了你的帖子 :)
这就是我解决这个问题的方法:
# Request headers asking the server for Polish-language content.
headers = {"Accept-Language": "pl"}
...
# NOTE(review): HTMLSession and `link` are not defined in this fragment;
# HTMLSession is presumably requests_html.HTMLSession -- confirm the import.
s = HTMLSession()
# Fetch the page at the whitespace-stripped link, sending the headers above.
response = s.get(link.strip(), headers=headers)
# NOTE(review): presumably executes the page's JavaScript before parsing -- confirm.
response.html.render()
# Override the response encoding with the value of apparent_encoding
# (presumably the encoding detected from the body -- confirm).
response.encoding = response.apparent_encoding