Python 网页抓取:输出重复且显示不正确的问题

问题描述 投票:0回答:1

我的代码有一个问题,我尝试过排查,但始终找不到原因。问题在于:循环的输出没有正确显示,数据也没有正确插入到我的数据库中。我希望抓取到的每一行数据都能打印出来,然后插入到数据库表中。但到目前为止,我得到的只是同一条结果被重复打印多次(而且连价格都不正确)。

当前实际输出。

Ford C-MAX 2019 1.1 Petrol 0
Ford C-MAX 2019 1.1 Petrol 0
Ford C-MAX 2019 1.1 Petrol 0
...

期望的输出,应与网页上的广告一致(以下只是一个例子,因为页面内容是动态的)。

Ford C-MAX 2019 1.1 Petrol 15950
Ford C-MAX 2014 1.6 Diesel 12000
Ford C-MAX 2011 1.6 Diesel 9000
...

代码:

from __future__ import print_function
import requests
import re
import locale
import time
from time import sleep
from random import randint
from currency_converter import CurrencyConverter
c = CurrencyConverter()
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date, datetime, timedelta
import mysql.connector
import numpy as np
import itertools

# Switch every locale category to US English (UTF-8).
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' )

# Offsets 0, 30, ..., 180; each one is appended to the listing URL as the
# `start` query parameter when the result pages are requested below.
pages = np.arange(0, 210, 30)

# Timestamp captured once when the script starts; it is the value passed as
# the `entered` argument in the (commented-out) insert call further down.
entered = datetime.now()
# Fixed make/model printed with every scraped row.
make = "Ford"
model = "C-MAX"


def insertvariablesintotable(make, model, year, liter, fuel, price, entered):
    """Insert one scraped listing into the ``ford_cmax`` MySQL table.

    A new connection is opened for every call. The table is created first if
    it does not exist, then a single row is inserted with a parameterized
    query and the transaction is committed.

    Args:
        make: Manufacturer name (column ``make``, VARCHAR(15)).
        model: Model name (column ``model``, VARCHAR(20)).
        year: Registration year (column ``year``, INT).
        liter: Engine size (column ``liter``, VARCHAR(3)).
        fuel: Fuel type (column ``fuel``, VARCHAR(6)).
        price: Listing price (column ``price``, INT).
        entered: Timestamp of the scrape (column ``entered``, TIMESTAMP).

    No exception is caught here: any error raised while connecting, creating
    the table, inserting or committing propagates to the caller unchanged.
    """
    # Bind both names before the ``try`` so the ``finally`` clause can tell how
    # far setup got. Without this, a failed connect() would make the cleanup
    # code raise UnboundLocalError and hide the real database error.
    cnx = None
    cursor = None
    try:
        cnx = mysql.connector.connect(user='root', password='', database='FYP', host='127.0.0.2', port='8000')
        cursor = cnx.cursor()

        cursor.execute('CREATE TABLE IF NOT EXISTS ford_cmax ( make VARCHAR(15), model VARCHAR(20), '
                       'year INT(4), liter VARCHAR(3), fuel VARCHAR(6), price INT(6), entered TIMESTAMP) ')

        # Values are passed separately from the SQL text so the driver handles
        # quoting/escaping.
        insert_query = """INSERT INTO ford_cmax (make, model, year, liter, fuel, price, entered) VALUES (%s,%s,%s,%s,%s,%s,%s)"""
        record = (make, model, year, liter, fuel, price, entered)

        cursor.execute(insert_query, record)

        cnx.commit()

    finally:
        # Only release what was actually acquired, and only while the
        # connection is still alive.
        if cnx is not None and cnx.is_connected():
            if cursor is not None:
                cursor.close()
            cnx.close()

# Fetch each result page and parse its listings.
# NOTE: `response` starts each iteration as the page offset taken from `pages`
# and is then rebound to the HTTP response object a few lines below.
for response in pages:

    # Browser-like User-Agent header sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    response = requests.get("https://www.donedeal.ie/cars/Ford/C-MAX?start=" + str(response), headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # A new connection and cursor are opened for every page, but neither is
    # used or closed anywhere inside this loop.
    cnx = mysql.connector.connect(user='root', password='', database='FYP', host='127.0.0.2', port='8000')
    cursor = cnx.cursor()

    # Walk every key-info list on the page. year/liter/fuel/mileage are plain
    # variables that are overwritten on each pass, so once this loop finishes
    # they hold only the values of the LAST matching element.
    for details in soup.findAll('ul', attrs={'class': 'card__body-keyinfo'}):

        # Work on the element's text; the fields are cut out by fixed offsets.
        details = details.text
        #print(details)
        year = details[:4]
        liter = details[4:7]
        fuel = details[8:14] # excludes electric which has 2 extra
        # Collect every "<digits>,<3 digits>" group plus the 3 characters after it.
        mileage = re.findall("[0-9]*,[0-9][0-9][0-9]..." , details)
        mileage = ''.join(mileage)
        mileage = mileage.replace(",", "")
        if "mi" in mileage:
            # rstrip removes trailing 'm'/'i' characters; the value is then
            # multiplied by 1.609 (miles -> kilometres) and rounded.
            mileage = mileage.rstrip('mi')
            mileage = round(float(mileage) * 1.609)
        mileage = str(mileage)
        if "km" in mileage:
            mileage = mileage.rstrip('km')
        # NOTE: `"123" or "1234" or ...` evaluates to just "123", so this
        # replaces every occurrence of the substring "123" with "0".
        # `mileage` is computed here but is not part of the print below.
        mileage = mileage.replace("123" or "1234" or "12345" or "123456", "0")

    # Walk every price element on the page. As with the loop above, `price`
    # is overwritten on each pass and only the last value survives the loop.
    for price in soup.findAll('p', attrs={'class': 'card__price'}):

        price = price.text
        price = price.replace("No Price", "0")
        # NOTE: same `or` chain as above; only the substring "123" is replaced.
        price = price.replace("123" or "1234" or "12345" or "123456", "0")
        price = price.replace(",","")
        price = price.replace("€", "")
        if "p/m" in price:
            #price = price[:-3]
            # The stripped value is immediately discarded: "p/m" prices are
            # forced to "0" on the next line.
            price = price.rstrip('p/m')
            price = "0"
        if "£" in price:
            # Sterling price: drop the symbol, convert GBP -> EUR with the
            # CurrencyConverter instance `c`, then round the result.
            price = price.replace("£", "")
            price = c.convert(price, 'GBP', 'EUR')
            price = round(price)

    # This print sits outside both inner loops, so it runs once per page and
    # shows only the last details/price values seen on that page.
    print(make, model, year, liter, fuel, price)

    #insertvariablesintotable(make, model, year, liter, fuel, price, entered) #same result as above
python mysql beautifulsoup screen-scraping
1个回答
1
投票

我看了你的代码和你要抓取数据的网站。看起来你在取回页面之后,对该页面上的所有价格进行循环,并用 price 这个变量来保存取到的数据,但每次进入 for 循环都会覆盖它,因此循环结束后只剩下最后一个值。你处理详细信息(details)的 for 循环也是同样的情况。

下面是你可以尝试的替代方法。

# Alternative approach: instead of overwriting a single variable on every
# loop pass, collect the parsed values of every listing in lists and print
# (or insert) them afterwards.
make = "Ford"
model = "C-MAX"
price_list = []  # one entry per kept listing price
details_list = []  # one (year, liter, fuel, mileage) tuple per listing

# Only the first 30 listings of each page are used.
max_per_page = 30

# Values treated as "no real value" and stored as "0" -- presumably dummy
# numbers typed in by sellers. The original `"123" or "1234" or ...`
# expression evaluates to just "123", which replaced that substring inside
# legitimate values instead of matching these placeholders.
placeholder_values = {"123", "1234", "12345", "123456"}

# The request headers are the same for every page, so build them once.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
}

for start in range(1, 60, 30):  # short range loop for testing
    response = requests.get(
        "https://www.donedeal.ie/cars/Ford/C-MAX?start=" + str(start),
        headers=headers,
    )
    soup = BeautifulSoup(response.text, "html.parser")

    # --- details: year / engine size / fuel / mileage --------------------
    cards = soup.findAll("ul", attrs={"class": "card__body-keyinfo"})
    for card in cards[:max_per_page]:
        text = card.text
        year = text[:4]
        liter = text[4:7]
        fuel = text[8:14]  # excludes electric which has 2 extra
        # "<digits>,<3 digits>" plus the 3 characters that follow (the unit).
        mileage = "".join(re.findall(r"[0-9]*,[0-9][0-9][0-9]...", text))
        mileage = mileage.replace(",", "")
        if "mi" in mileage:
            # Miles -> kilometres.
            mileage = mileage.rstrip("mi")
            mileage = str(round(float(mileage) * 1.609))
        if "km" in mileage:
            mileage = mileage.rstrip("km")
        mileage = mileage.strip()
        if mileage in placeholder_values:
            mileage = "0"
        # End of one pass: store the values instead of overwriting them.
        details_list.append((year, liter, fuel, mileage))

    # --- prices -----------------------------------------------------------
    kept = 0  # prices appended for this page
    for tag in soup.findAll("p", attrs={"class": "card__price"}):
        if kept == max_per_page:
            break
        price = tag.text
        if "p/m" in price:
            # Per-month prices are not stored; adding them would make
            # price_list and details_list differ in length. This check runs
            # before any conversion so `price` is still a string here.
            continue
        price = price.replace("No Price", "0")
        price = price.replace(",", "").replace("€", "")
        is_sterling = "£" in price
        price = price.replace("£", "").strip()
        if price in placeholder_values:
            price = "0"
        if is_sterling:
            # Convert GBP -> EUR with the CurrencyConverter instance `c`.
            price = round(c.convert(float(price), "GBP", "EUR"))
        price_list.append(price)
        kept += 1  # count only values that were actually appended

# Pair each listing's details with its price. zip() stops at the shorter
# list, so a length mismatch cannot raise IndexError.
for (year, liter, fuel, _mileage), price in zip(details_list, price_list):
    print(make, model, year, liter, fuel, price)
    # add your insertvariablesintotable(make, model, year, liter, fuel, price, entered) call here

编辑:我没有把 p/m(按月付款)价格添加到列表中,因为它们会使 details_list 和 price_list 的长度不一致。如果你想把 p/m 价格也加进去,就需要重新组织代码。另外,你应该不想要页面最下方的那 3 辆车(所以代码中每页最多只取 30 条),因为它们可能不是福特 C-MAX,而是其他车型,甚至可能是其他厂家的车型。

© www.soinside.com 2019 - 2024. All rights reserved.