Python - 保存网页抓取文件 - 波兰语字符编码时出错

问题描述 投票:0回答:1

我创建了一个代码块,它可以在网络上抓取波兰网站的房产列表信息。

import bs4
import csv
from urllib.request import urlopen as Open
from urllib.request import Request
from bs4 import BeautifulSoup as soup

# Browser-like User-Agent header sent with every request made by this script.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
# First page of the search results (72 ads per page, newest first).
results = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page=1"
req = Request(url=results, headers=headers)
# Read the body inside a context manager so the HTTP response (and its
# underlying connection) is closed as soon as the bytes have been read.
with Open(req) as response:
    html = response.read()

page_soup = soup(html, "html.parser")
# The page count is taken from the text of the fifth <li> inside the pager
# below the offers. This raises AttributeError/IndexError/ValueError if the
# site's markup differs from what is expected here.
# NOTE(review): total_pages is not used by the rest of the visible script.
total_pages = int(page_soup.find("div",{"class":"after-offers clearfix"}).find("ul",{"class":"pager"}).findAll("li")[4].text)

# Rows destined for the CSV file; the first row is the header.
#
# Every data row appended later has the layout
#     [price, location, <12 feature values>, link]
# i.e. 15 columns, so the header must list each feature label exactly once.
# (The previous header repeated "Ogrzewanie" and "Okna", which produced 17
# header cells and shifted every following column label off its data.)
offer_list = [[
    "Price",
    "Location",
    "Forma własności",
    "Liczba pięter",
    "Liczba pokoi",
    "Materiał budynku",
    "Ogrzewanie",
    "Okna",
    "Piętro",
    "Powierzchnia",
    "Rodzaj zabudowy",
    "Rok budowy",
    "Rynek",
    "Stan wykończenia",
    "Link",
]]

# Feature labels searched for on every listing page, in CSV column order.
# Defined once here instead of being rebuilt for every listing.
feature_names = (
    "Forma własności",
    "Liczba pięter",
    "Liczba pokoi",
    "Materiał budynku",
    "Ogrzewanie",
    "Okna",
    "Piętro",
    "Powierzchnia",
    "Rodzaj zabudowy",
    "Rok budowy",
    "Rynek",
    "Stan wykończenia",
)

# Only results page 1 is scraped; the URL's page parameter is 1-based, so the
# range starts at 1 rather than incrementing the loop variable in the body.
for page in range(1, 2):
    print(page)
    results = "https://www.otodom.pl/sprzedaz/mieszkanie/?nrAdsPerPage=72&search%5Border%5D=created_at_first%3Adesc&page="+str(page)

    # Fetch the results page; the context manager closes the HTTP response.
    req = Request(url=results, headers=headers)
    with Open(req) as response:
        html = response.read()

    page_soup = soup(html, "html.parser")

    # Only <article> elements tagged as non-promoted listings are visited.
    listings = page_soup.findAll("article",{"data-featured-name":"listing_no_promo"})

    for i in listings:
        # The first <a> inside the article carries the listing's own URL.
        listing = i.a.get("href")
        req = Request(url=listing, headers=headers)
        with Open(req) as response:
            html = response.read()

        # Separate name so the results-page soup is not overwritten.
        listing_soup = soup(html, "html.parser")

        # get location: third "}"-separated chunk of the map link's text
        location = listing_soup.find("a", {"href":"#map"}).text.split("}")[2]

        # get price: strip spaces and the currency suffix
        price = listing_soup.find("div", {"class":"css-1vr19r7"}).text.replace(" ","").replace("zł","")

        # get property features: the <li> items of the first <ul> following
        # the overview section
        container = listing_soup.find("section", {"class":"section-overview"}).findNext("div").ul.findAll("li")

        features = []

        for feature in feature_names:
            for contain in container:
                if feature in contain.text:
                    # Keep the text after the first ":" and drop the area unit.
                    features.append(contain.text.split(":")[1].replace(" m²",""))
                    break
            else:  # no <li> mentioned this feature
                features.append("N/A")

        # Row layout: price, location, one value per feature label, link.
        offer = [price, location, *features, listing]
        offer_list.append(offer)

# Write all collected rows (header first) to filename.csv.
#
# * newline="" is what the csv module requires for files it writes to;
#   without it, platforms that translate "\n" end up with an extra blank
#   line after every row.
# * "utf-8-sig" writes a UTF-8 byte-order mark at the start of the file.
#   Programs that otherwise assume a legacy code page use it to recognise
#   the file as UTF-8 — presumably the reason the Polish characters looked
#   garbled when the plain "utf-8" file was opened; confirm with the viewer
#   actually used.
with open('filename.csv', 'w', encoding='utf-8-sig', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(offer_list)

print("data saved")

我已经到了保存文件这一步,但是波兰语字符被破坏了,例如

Åódź, łódzkie

有没有办法将波兰语字符转换为普通拉丁字母(例如把 ó 转换为 o),或者让它们保持原样?

python web-scraping encoding polish
1个回答
0
投票

说来有趣,我在编写脚本解析来自……otodom 的数据时也遇到了同样的问题,于是找到了你的帖子 :)

这就是我解决这个问题的方法:

# Request header asking the server for Polish-language content.
headers = {"Accept-Language": "pl"}
...
# NOTE(review): HTMLSession is not imported in this snippet — presumably
# requests_html.HTMLSession; confirm against the full script.
s = HTMLSession()
# `link` is defined outside this snippet; surrounding whitespace is stripped
# before it is used as the request URL.
response = s.get(link.strip(), headers=headers)
# NOTE(review): presumably renders the page's JavaScript — confirm.
response.html.render()
# Override the response's encoding with the value of apparent_encoding.
response.encoding = response.apparent_encoding
© www.soinside.com 2019 - 2024. All rights reserved.