使用Selenium来获取Strava排行榜表格

问题描述 投票:0回答:1

我有下面的代码，试图从这个网页中获取一个表格。我想抓取“本周排行榜”（This Week's Leaderboard），但要把表格真正提取出来却很麻烦。有什么好办法可以得到这个表格吗？我目前只能得到 HTML 输出，而且实在不想用正则表达式去解析它。

### Libraries/packages
import pandas as pd
import numpy as np
import re
import requests
import datetime
from datetime import datetime
import urllib
from selenium import webdriver
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from xml.etree import ElementTree as ET
from functools import reduce
import smtplib



### Function 1
def strava_page():
    """Return the Strava URLs used by the scraper.

    Returns:
        A new two-element list: the login page URL followed by the club
        page URL.
    """
    # in this example, I'm using a public group. In my code there are extra steps to login, which I omit here
    login_url = 'https://www.strava.com/login'
    club_url = 'https://www.strava.com/clubs/roosevelt-island-dc-parkrun'
    return [login_url, club_url]


### Function 2
def strava_login(urllist):
    """Open the Strava club page and return the leaderboard body as HTML.

    The login steps are kept commented out because the example club is
    public; only ``urllist[1]`` is actually loaded.

    Args:
        urllist: Sequence of URLs. Index 0 is the login page (used only by
            the commented-out login steps); index 1 is the club page.

    Returns:
        The ``innerHTML`` of the leaderboard ``<tbody>`` element, as a string.
        The same string is also printed to stdout.

    Note:
        If the wait times out, a message is printed and the element lookup
        is still attempted, so a missing table surfaces as the lookup's own
        Selenium exception. The browser is shut down in every case.
    """
    # The selector is used twice (wait + lookup); keep a single definition so
    # the two cannot drift apart.
    leaderboard_selector = (
        'body > div.view > div.page.container > div:nth-child(4) > '
        'div.spans11 > div > div:nth-child(2) > div.leaderboard > table > tbody'
    )

    driver = webdriver.Chrome(executable_path = r"/Users/user/Documents/chromedriver")
    try:
        # open login page
        #url = urllist[0]
        #driver.get(url)
        #login = driver.find_element_by_xpath('//*[@id="login_form"]/div[2]/a')
        #login.click()

        # input username login
        #emailbox = driver.find_element_by_xpath('//*[@id="identifierId"]')
        #emailbox.send_keys('user')
        #emailbox.send_keys(u'\ue007')

        # input password
        #passbox = driver.find_element_by_id("identifierNext")
        #passbox = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//input[@name='password']")))
        #passbox.send_keys('password')
        #passbox.send_keys(u'\ue007')

        # navigate to Strava group page
        driver.get(urllist[1])

        # get "This Week's Leaderboard"
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, leaderboard_selector))
            )
        except TimeoutException:
            print("Request Timed Out...Idiot")

        table = driver.find_element_by_css_selector(leaderboard_selector).get_attribute('innerHTML')
        print(table)
        return table
    finally:
        # quit() ends the whole WebDriver session (browser and driver
        # process), and running it in ``finally`` prevents a leaked browser
        # when the page load or the element lookup raises.
        driver.quit()

### Call functions
# Build the URL list: [login page, club page].
one = strava_page()
# Load the club page in Chrome and capture the leaderboard <tbody> inner HTML.
two = strava_login(one)
# Bare expression: echoes the value in a notebook/REPL; it does nothing when run as a script.
two

我得到了下面这样的返回结果。我已经尝试了各种 xpath 和 css selector 路径来获取表格本身，也试过 pd.read_html() 和 pd.DataFrame()，但都失败了。

<td class="rank">1</td>
<td class="athlete">
<div class="avatar avatar-athlete avatar-sm">
<a class="avatar-content" href="/athletes/34781197">
<div class="avatar-img-wrapper avatar-default">

<img alt="Chi H." src="/assets/avatar/athlete/medium.png" title="Chi H.">
</div>
</a>
</div>
<a class="athlete-name minimal" href="/athletes/34781197">
Chi H.
</a>
</td>
<td class="distance highlighted-column">86.3 <abbr class="unit short" title="kilometers">km</abbr></td>
<td class="num-activities">8</td>
<td class="longest-activity">
42.2 <abbr class="unit short" title="kilometers">km</abbr>
</td>
<td class="average-pace">7:37 <abbr class="unit short" title="minutes per kilometer">/km</abbr></td>
<td class="elev-gain">528 <abbr class="unit short" title="meters">m</abbr></td>
</tr><tr>
<td class="rank">2</td>
<td class="athlete">
<div class="avatar avatar-athlete avatar-sm">
<a class="avatar-content" href="/athletes/1802199">
<div class="avatar-img-wrapper avatar-default">

<img alt="Andrew P." src="/assets/avatar/athlete/medium.png" title="Andrew P.">
</div>
</a>
</div>
<a class="athlete-name minimal" href="/athletes/1802199">
Andrew P.
</a>
</td>
<td class="distance highlighted-column">74.6 <abbr class="unit short" title="kilometers">km</abbr></td>
<td class="num-activities">7</td>
<td class="longest-activity">
22.5 <abbr class="unit short" title="kilometers">km</abbr>
</td>
<td class="average-pace">5:55 <abbr class="unit short" title="minutes per kilometer">/km</abbr></td>
<td class="elev-gain">3,685 <abbr class="unit short" title="meters">m</abbr></td>
</tr><tr>
<td class="rank">3</td>
<td class="athlete">
<div class="avatar avatar-athlete avatar-sm">
<a class="avatar-content" href="/athletes/4702810">
<div class="avatar-img-wrapper avatar-default">

<img alt="Sarah Reese B." src="/assets/avatar/athlete/medium.png" title="Sarah Reese B.">
</div>
</a>
</div>
<a class="athlete-name minimal" href="/athletes/4702810">
Sarah Reese B.
</a>
</td>
<td class="distance highlighted-column">70.1 <abbr class="unit short" title="kilometers">km</abbr></td>
<td class="num-activities">11</td>
<td class="longest-activity">
12.8 <abbr class="unit short" title="kilometers">km</abbr>
</td>
<td class="average-pace">4:49 <abbr class="unit short" title="minutes per kilometer">/km</abbr></td>
<td class="elev-gain">298 <abbr class="unit short" title="meters">m</abbr></td>
</tr><tr>
<td class="rank">4</td>
<td class="athlete">
<div class="avatar avatar-athlete avatar-sm">
<a class="avatar-content" href="/athletes/1284779">
<div class="avatar-img-wrapper avatar-default">

Etc.

Etc. 
python html css selenium
1个回答
0
投票

我想，如果直接调用后端服务并以 JSON 格式获取所有数据，会更容易。然而，这样的请求可能需要携带 cookie。所以我试着用 Selenium 来抓取表格的表头和各行数据：

# Scrape the club leaderboard into a list of header names and a list of rows.
# NOTE: assumes `driver` is an already-created Selenium WebDriver instance.
try:
    driver.get('https://www.strava.com/clubs/roosevelt-island-dc-parkrun-215283')

    # Column titles from the leaderboard's <thead>.
    table_headers = [
        header.text
        for header in driver.find_elements_by_xpath("//div[@class='leaderboard']/table/thead//th")
    ]
    print(table_headers)

    # One list of cell texts per <tr> in the leaderboard body.
    table_rows = []
    for row in driver.find_elements_by_xpath("//div[@class='leaderboard']/table/tbody//tr"):
        # Build a NEW list for every row. Reusing and clearing one shared list
        # would leave table_rows holding N references to the same object, i.e.
        # the last row repeated N times.
        myrow = [cell.text for cell in row.find_elements_by_xpath("./td")]
        table_rows.append(myrow)
        print(myrow)
    print(table_rows)
finally:
    # Always end the browser session, even if the page load or a lookup fails.
    driver.quit()

输出：

['Rank', 'Athlete', 'Distance', 'Runs', 'Longest', 'Avg. Pace', 'Elev. Gain']
['1', 'Chi H.', '86.3 km', '8', '42.2 km', '7:37 /km', '528 m']
['2', 'Lokesh M.', '47.5 km', '4', '13.2 km', '5:28 /km', '1,046 m']
['3', 'Sarah Reese B.', '47.5 km', '9', '12.8 km', '4:45 /km', '168 m']
['4', 'SP R.', '46.2 km', '3', '20.2 km', '4:40 /km', '128 m']
['5', 'Sharada P.', '41.4 km', '4', '12.4 km', '8:46 /km', '--']
© www.soinside.com 2019 - 2024. All rights reserved.