有没有办法去除Python代码中多余的间距?

问题描述 投票:0回答:1

我下面的代码得到了每个健身房的街道地址,但是在输出的健身房开放时间的间距上出现了错误。请问我哪里出错了?

import urlparse

from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import time
import csv

# Download the sitemap page and parse it.
sitemap = 'https://www.planetfitness.com/sitemap'
sitemap_content = requests.get(sitemap).content
soup = BeautifulSoup(sitemap_content, 'html.parser')

# Select anchors that are direct children of a <td> whose class list contains
# "club-title" and whose href starts with "/gyms", then collect the hrefs.
atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
links = [atag.get('href') for atag in atags]

# Write one CSV row per gym page.
with open('gyms.csv', 'w') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        # Resolve the relative href against the sitemap URL and fetch that page.
        gymurl = urlparse.urljoin(sitemap, link)
        sitemap_content = requests.get(gymurl).content
        soup = BeautifulSoup(sitemap_content, 'html.parser')
        # The row starts with the gym's URL.
        gymrow = [ gymurl ]

        # Append the address parts; each lookup takes the first matching span
        # and raises IndexError if the page has no such span.
        address_line1 = soup.select('p[class~=address] > span[class~=address-line1]')
        gymrow.append(address_line1[0].text)
        locality = soup.select('p[class~=address] > span[class~=locality]')
        gymrow.append(locality[0].text)
        administrative_area = soup.select('p[class~=address] > span[class~=administrative-area]')
        gymrow.append(administrative_area[0].text)
        postal_code = soup.select('p[class~=address] > span[class~=postal-code]')
        gymrow.append(postal_code[0].text)
        country = soup.select('p[class~=address] > span[class~=country]')
        gymrow.append(country[0].text)

        # Find the <strong> whose text is exactly 'Club Hours' and take the
        # first following sibling that is a Tag (text nodes are skipped).
        strongs = soup.select('div > strong')
        for strong in strongs:
            if strong.text == 'Club Hours':
                for sibling in strong.next_siblings:
                    if isinstance(sibling, Tag):
                        # The sibling's text is appended unchanged, so any
                        # whitespace or line breaks it contains go into the row.
                        hours = sibling.text
                        gymrow.append(hours)
                        # Leaves only the sibling loop; the outer loop over
                        # <strong> elements keeps running.
                        break
        print(gymrow)
        gymwriter.writerow(gymrow)
        # Pause for three seconds between gym pages.
        time.sleep(3)

谢谢您的帮助

python web-scraping beautifulsoup python-requests href
1个回答
1
投票

您想选择的是带有 club-title 类的 td 元素,它包含一个 a 元素,然后提取该 a 元素的 href 属性。

from bs4 import BeautifulSoup
from bs4 import Tag
import requests
import urllib.parse
import time
import csv

# Download the sitemap page and parse it.
sitemap = 'https://www.planetfitness.com/sitemap'
res = requests.get(sitemap).content
soup = BeautifulSoup(res, 'html.parser')

# The rows in the table of gyms are formatted like so:
# <tr>
# <td class="club-title"><a href="/gyms/albertville-al"><strong>Albertville, AL</strong> <p>5850 US Hwy 431</p></a></td>
# <td class="club-join"><div class="button"><a href="/gyms/albertville-al/offers" title="Join Albertville, AL">Join Now</a></div></td>
# </tr>

# This will find all the links to all the gyms.
atags = soup.select('td[class~=club-title] > a[href^="/gyms"]')
links = [atag.get('href') for atag in atags]

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with a blank line after
# every row.
with open('gyms.csv', 'w', newline='') as gf:
    gymwriter = csv.writer(gf)
    for link in links:
        # Follow the link to this gym
        gymurl = urllib.parse.urljoin(sitemap, link)
        res = requests.get(gymurl).content
        soup = BeautifulSoup(res, 'html.parser')
        # The row starts with the gym's URL.
        gymrow = [ gymurl ]

        # The address of this gym. Each lookup takes the first matching span
        # and raises IndexError if the page has no such span.
        address_line1 = soup.select('p[class~=address] > span[class~=address-line1]')
        gymrow.append(address_line1[0].text)
        locality = soup.select('p[class~=address] > span[class~=locality]')
        gymrow.append(locality[0].text)
        administrative_area = soup.select('p[class~=address] > span[class~=administrative-area]')
        gymrow.append(administrative_area[0].text)
        postal_code = soup.select('p[class~=address] > span[class~=postal-code]')
        gymrow.append(postal_code[0].text)
        country = soup.select('p[class~=address] > span[class~=country]')
        gymrow.append(country[0].text)

        # The hours of this gym: the first Tag sibling that follows the
        # <strong> whose text is exactly 'Club Hours'.
        strongs = soup.select('div > strong')
        for strong in strongs:
            if strong.text == 'Club Hours':
                for sibling in strong.next_siblings:
                    if isinstance(sibling, Tag):
                        hours = sibling.text
                        # Join the lines of the hours text with ', ' so the
                        # value stays on a single line in the CSV field.
                        gymrow.append(hours.replace('<br>', '').replace('\n', ', '))
                        # Leaves only the sibling loop.
                        break

        gymwriter.writerow(gymrow)
        # Pause for three seconds between gym pages.
        time.sleep(3)

当我运行这段代码时,得到了如下输出:

$ more gyms.csv

https://www.planetfitness.com/gyms/albertville-al,5850 US Hwy 431,Albertville,AL,35950,United States,"Monday-Friday 6am-9pm, Sat
urday-Sunday 7am-7pm"
https://www.planetfitness.com/gyms/alexander-city-al,987 Market Place,Alexander City,AL,35010,United States,Convenient hours whe
n we reopen
https://www.planetfitness.com/gyms/bessemer-al,528 W Town Plaza,Bessemer,AL,35020,United States,Convenient hours when we reopen
https://www.planetfitness.com/gyms/birmingham-crestline-al,4500 Montevallo Rd,Birmingham,AL,35210,United States,Convenient hours
 when we reopen
.
.
.

0
投票

要调试这个问题,你应该先打印出 atags 的值。你正在搜索所有带有 clubs-list 类的 a 标签,但这样的标签一个也没有。这些 a 标签本身没有类,但它们的父元素 td 具有 club-title 类。

你可以试试下面这样的写法:

# Download the sitemap and parse it.
res = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(res, 'html.parser')

# Look up the <td class="club-title"> cells and read the href of the first
# anchor inside each one.
cells = soup.find_all('td', {'class': 'club-title'})
links = [cell.find('a')['href'] for cell in cells]
keywords = ['gyms']

# Print each href once if it contains at least one of the keywords.
for href in links:
    for word in keywords:
        if word in href:
            print(href)
            break

0
投票

这样可以得到该页面上的每一个链接及其地址。看起来如果你想获取每个俱乐部的更多信息,就必须遍历这些链接并逐个加载对应的页面。

from bs4 import BeautifulSoup
import requests

# Download the sitemap and parse it.
res = requests.get("https://www.planetfitness.com/sitemap").content
soup = BeautifulSoup(res, 'html.parser')

# Despite the name, these are the <td class="club-title"> cells, not anchors.
atags = soup.find_all('td', {'class':'club-title'})

# Pair the href of each cell's first <a> with the text of its first <p>.
# The original line had an unbalanced ')' before the closing ']', which is a
# SyntaxError.
links = [(atag.find('a')['href'], atag.find('p').text) for atag in atags]

# Use a plain loop: a list comprehension run only for print() side effects
# builds a throwaway list of None values.
for link in links:
    print(link)
© www.soinside.com 2019 - 2024. All rights reserved.