BeautifulSoup,Pandas中正则表达式的想法,要求下载到excel

问题描述 投票:0回答:1

我不知道该怎么做。有谁知道如何把代码"A"合并到下面的完整代码中(用代码"A"替换代码"B")?我基本上是想把比赛结果拆分成3列:胜/负/平(Win / Loss / Tie,例如 W)、比分(例如 2-1)以及是否有加时赛(OT):

# Code A:
    # Collect the <div> elements that match this class string.
    rows = soup.find_all('div',class_="sidearm-schedule-game-row flex flex-wrap flex-align-center row")
    # Split the result text into three regex groups: one capital letter before the
    # comma, a run of digits/hyphens, and whatever text follows. [0] takes the first
    # match, so this raises IndexError when the pattern does not match.
    # NOTE(review): `row` is not defined in this fragment — presumably one element of `rows`.
    score = re.findall(r'([A-Z]),\s+([\d-]+)\s*(.*)', row.select_one('.sidearm-schedule-game-result').get_text(strip=True, separator=' '))[0]

# Code B

    # Append the stripped text of every result <div> found in `soup` to `res`.
    for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
        result = result.get_text(strip=True)
        res.append(result)
    # When the date list and the result list differ in length, add one "None" placeholder.
    # NOTE(review): `d` and `res` are defined outside this fragment. A single placeholder
    # appended at the end may not line up with the game that lacks a result — confirm.
    if len(d) != len(res):
        res.append("None")

#  trying to replace Code B with Code A from above:

        import requests
        import re
        from bs4 import BeautifulSoup
        import pandas as pd
        from itertools import zip_longest

        # Parallel accumulators filled across all seasons; each list is appended to
        # independently, so row alignment relies on every list growing at the same rate.
        d = []  # date text
        n = []  # opponent-name text
        res = []  # result text
        op = []  # opponent-text block
        yr = []  # season year, one entry per opponent-text block
        # One Session is reused for every request.
        with requests.Session() as req:
            for year in range(2003, 2020):
                print(f"Extracting Year# {year}")
                r = req.get(
                    f"https://lehighsports.com/sports/mens-soccer/schedule/{year}")
                # Pages that do not answer with HTTP 200 are skipped without any message.
                if r.status_code == 200:
                    soup = BeautifulSoup(r.text, 'html.parser')
                    for date in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-date flex-item-1'}):
                        d.append(date.get_text(strip=True, separator=" "))
                    for name in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-name'}):
                        n.append(name.get_text(strip=True))
                    # This loop and the length check below are the "Code B" the question refers to.
                    for result in soup.findAll("div", {'class': 'sidearm-schedule-game-result'}):
                        result = result.get_text(strip=True)
                        res.append(result)
                    # Adds at most one "None" per season, at the end of `res`.
                    # NOTE(review): if more than one game lacks a result, or the missing one
                    # is not the last, `res` no longer lines up with `d` — confirm.
                    if len(d) != len(res):
                        res.append("None")
                    for opp in soup.findAll("div", {'class': 'sidearm-schedule-game-opponent-text'}):
                        op.append(opp.get_text(strip=True, separator=' '))
                        yr.append(year)


        # zip_longest pads the shorter lists with None, so uneven lists still yield full rows.
        data = []
        for items in zip_longest(yr, d, n, op, res):
            data.append(items)

        # NOTE: `df` receives the return value of to_excel(), not the DataFrame itself.
        df = pd.DataFrame(data, columns=['Year', 'Date', 'Name', 'opponent', 'Result']).to_excel('lehigh.xlsx', index=False)

regex python-3.x pandas beautifulsoup itertools
1个回答
1
投票
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

# Seasons to scrape, newest first; each value is appended to the schedule URL.
year_id = [str(season) for season in range(2019, 2002, -1)]

BASE_URL = 'https://lehighsports.com/sports/mens-soccer/schedule/'
ROW_CLASS = "sidearm-schedule-game-row flex flex-wrap flex-align-center row"

# Three groups: one capital letter before the comma, a run of digits/hyphens,
# and any trailing text (e.g. an overtime marker).
RESULT_RE = re.compile(r'([A-Z]),\s+([\d-]+)\s*(.*)')

# Alphabetical order, matching what the former append(sort=True) calls produced.
COLUMNS = ['conference', 'date', 'list', 'opponent', 'ot', 'result', 'score', 'year']


def _cell_text(row, css_class):
    """Return the stripped text of the first <div> with *css_class* in *row*.

    Returns an empty string when the row has no such <div>, so one incomplete
    game row does not abort the whole scrape.
    """
    node = row.find('div', class_=css_class)
    return node.text.strip() if node is not None else ''


def _split_result(row):
    """Return ``(result, score, ot)`` parsed from the row's result cell.

    Returns three empty strings when the result cell is missing or its text
    does not match ``RESULT_RE`` (for example a game with no recorded result).
    """
    node = row.select_one('.sidearm-schedule-game-result')
    if node is None:
        return ('', '', '')
    match = RESULT_RE.search(node.get_text(strip=True, separator=' '))
    return match.groups() if match else ('', '', '')


# Plain dicts are collected first and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
records = []
with requests.Session() as session:
    for year in year_id:
        url = BASE_URL + year
        print(url)
        response = session.get(url)
        if response.status_code != 200:
            # Skip seasons whose page is unavailable instead of parsing an error page.
            print(f"Skipping {year}: HTTP {response.status_code}")
            continue
        soup = BeautifulSoup(response.text, 'lxml')

        for row in soup.find_all('div', class_=ROW_CLASS):
            result, score, ot = _split_result(row)
            records.append({
                'year': year,
                'date': _cell_text(row, "sidearm-schedule-game-opponent-date"),
                'opponent': _cell_text(row, "sidearm-schedule-game-opponent-name"),
                'list': _cell_text(row, "sidearm-schedule-game-opponent-text"),
                'conference': _cell_text(row, "sidearm-schedule-game-conference-conference"),
                'result': result,
                'score': score,
                'ot': ot,
            })

results = pd.DataFrame(records, columns=COLUMNS)
results.to_excel('lehigh.xlsx')
© www.soinside.com 2019 - 2024. All rights reserved.