我正在尝试从网站上抓取数据,但在多页面抓取上遇到了困难。不知何故,我的迭代总是导致“所有数组必须具有相同的长度”(all arrays must be of the same length)错误。有人可以帮我看看哪里做错了吗?下面是我使用的代码:
import requests
from bs4 import BeautifulSoup
import pandas as pd
def replaced(text):
    """Remove the five-newline run that the site's markup leaves in scraped text."""
    junk = '\n' * 5
    return text.replace(junk, '')
# Scrape `total_page` result pages and accumulate per-listing fields in
# parallel column lists, then assemble them into a DataFrame-ready dict.
total_page = 3
current_page = 1

judul_list = []
harga_list = []
distance = []
transmit = []
location = []
sp = []
rec_seller = []

while current_page <= total_page:
    url = f"https://www.mobil123.com/mobil-dijual/indonesia?page_number={current_page}&page_size=25"
    # A non-empty User-Agent avoids being served a blocked/empty page.
    headers = {"User-Agent": "Mozilla/5.0"}
    page_request = requests.get(url, headers=headers)
    soup = BeautifulSoup(page_request.content, "html.parser")

    containers = soup.find_all('div', {'class': 'grid'})
    container = containers[0]

    judul = container.findAll('h2', {'class': 'listing__title epsilon flush'})
    judul_list += [replaced(i.text) for i in judul]

    harga = container.findAll('div', {'class': 'listing__price delta weight--bold'})
    harga_list += [replaced(j.text) for j in harga]

    specs = container.findAll('div', {'class': 'listing__specs soft-quarter--ends soft-half--sides milli'})
    for k in specs:
        # Some listings omit spec fields, so the '|'-separated text has
        # fewer parts than expected. Pad with None so every column list
        # grows by exactly one row per listing — the ragged rows here were
        # the cause of "all arrays must be of the same length".
        parts = [p.strip() for p in replaced(k.text).split('|')]
        parts += [None] * (6 - len(parts))
        distance.append(parts[1])
        transmit.append(parts[2])
        location.append(parts[3])
        sp.append(parts[4])
        rec_seller.append(parts[5])

    current_page += 1

# Title looks like "<year> <brand> <series> ..."; pad short titles with
# None as well so the three derived columns keep matching lengths.
tahun, merek, series = [], [], []
for a in judul_list:
    words = [w.strip('|') for w in a.split()]
    words += [None] * (3 - len(words))
    tahun.append(words[0])
    merek.append(words[1])
    series.append(words[2])

# Create DataFrame input: every column now has the same length.
data = {
    'Tahun': tahun,
    'Merek': merek,
    'Series': series,
    'Harga': harga_list,
    'Distance': distance,
    'Transmit': transmit,
    'Location': location,
    'SP': sp,
    'Rec_Seller': rec_seller,
}
如能提供任何帮助,我将不胜感激!
不要使用多个无法保证长度相等的独立列表,而是改用由 dictionary 组成的 list(列表中每个元素是一个字典)——这样做还有一个额外的优点:在转换为 DataFrame 时,缺失的键会被自动处理为缺失值。
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Collect one dict per listing; when pandas builds the DataFrame, any key
# missing from a row simply becomes NaN, so ragged listings are harmless.
total_page = 3
current_page = 1
data = []

while current_page <= total_page:
    url = f"https://www.mobil123.com/mobil-dijual/indonesia?page_number={current_page}&page_size=25"
    # A non-empty User-Agent avoids being served a blocked/empty page.
    headers = {"User-Agent": "Mozilla/5.0"}
    page_request = requests.get(url, headers=headers)
    soup = BeautifulSoup(page_request.content, "html.parser")

    for listing in soup.select('article.listing'):
        price_tag = listing.find('div', {'class': 'listing__price delta weight--bold'})
        d = {
            'judul': listing.h2.get_text(strip=True),
            'harga': price_tag.get_text(strip=True) if price_tag else None,
        }
        # Select the spec icons *within this listing* — the original
        # selected from the whole page (and shadowed the loop variable),
        # so every row received the last listing's specs.
        d.update({
            icon.get('class')[-1].split('--')[-1]: icon.next
            for icon in listing.select('.listing__specs i')
        })
        data.append(d)

    current_page += 1

# Keep the result instead of discarding it.
df = pd.DataFrame(data)