Python web scraping returns empty values

Problem description (0 votes, 1 answer)

I am trying to scrape MLB box scores and play-by-play information.


import requests
from bs4 import BeautifulSoup

url = "https://www.sportsnet.ca/baseball/mlb/games/2618275/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Capture values from linescore__container class
linescore_container = soup.find("div", class_="linescore__container")
teams = [team.text.strip() for team in linescore_container.find_all("div", class_="team__name")] if linescore_container else []
scores = [score.text.strip() for score in linescore_container.find_all("div", class_="team__score")] if linescore_container else []

# Capture values from AllIningsPBP__Wrapper class
innings_wrapper = soup.find("div", class_="AllIningsPBP__Wrapper")
innings = [inning.text.strip() for inning in innings_wrapper.find_all("div", class_="AllIningsPBP__Inning")] if innings_wrapper else []
totals = [total.text.strip() for total in innings_wrapper.find_all("div", class_="AllIningsPBP__Total")] if innings_wrapper else []

# Print captured values
print("title:", soup.title.text)
print("teams:", teams)
print("scores:", scores)
print("innings:", innings)
print("totals:", totals)

////////////////////////////////////

Output

The output is blank and it does not capture the AllIningsPBP__Wrapper values:
title: Sportsnet.ca
teams: []
scores: ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
innings: []
totals: []
pitchers: []
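
A quick way to confirm what is going on is to check whether the class names the scraper targets appear at all in the raw HTML that requests receives. A minimal diagnostic sketch (same URL as above); if the searches come back false, the markup is injected by JavaScript after page load, and BeautifulSoup alone will never see it:

import requests

url = "https://www.sportsnet.ca/baseball/mlb/games/2618275/"
html = requests.get(url).text

# Search the raw HTML for the class names the scraper targets.
# If they are absent, the content is rendered client-side by JavaScript.
for needle in ["linescore__container", "AllIningsPBP__Wrapper", "team__name"]:
    print(f"{needle!r} found in raw HTML: {needle in html}")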

/////////////////

import requests
from bs4 import BeautifulSoup

url = "https://www.mlb.com/gameday/orioles-vs-pirates/2024/04/07/745523/final/wrap"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find and extract the desired data
    title = soup.title.text.strip()
    teams = [team.text.strip() for team in soup.find_all("span", class_="team-name")]
    scores = [score.text.strip() for score in soup.find_all("span", class_="team-score")]

    innings_data = []
    innings_table = soup.find("table", class_="linescore-table")
    if innings_table:
        for row in innings_table.find_all("tr"):
            inning_cells = row.find_all("td")
            inning_values = [cell.text.strip() for cell in inning_cells]
            innings_data.append(inning_values)

    innings_description = []
    innings_description_wrapper = soup.find("div", class_="AllIningsPBP__Wrapper")
    if innings_description_wrapper:
        innings_description = [item.text.strip() for item in innings_description_wrapper.find_all("div", class_="AllIningsPBP__Inning")]

    # Print the extracted data
    print("Title:", title)
    print("Teams:", teams)
    print("Scores:", scores)
    print("Innings Data:", innings_data)
    print("Innings Description:", innings_description)
else:
    print("Failed to retrieve data from the URL.")

//////////////////

import requests
from bs4 import BeautifulSoup

url = "https://www.thebaseballcube.com/content/box/CHN202303300~r/"

# Add a header to mimic a real browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"
}

# Send a GET request to the URL with headers
response = requests.get(url, headers=headers)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Extract title
    title = soup.title.text.strip()

    # Extract teams
    teams = [team.text.strip() for team in soup.find_all("span", class_="bold")]

    # Extract scores
    scores = [score.text.strip() for score in soup.find_all("td", class_="bold")]

    # Extract innings data
    innings_data = []
    for inning in soup.find_all("tr", class_="box_line_score"):
        inning_text = " ".join(cell.text.strip() for cell in inning.find_all("td"))
        innings_data.append(inning_text)

    # Print the extracted data
    print("Title:", title)
    print("Teams:", teams)
    print("Scores:", scores)
    print("Innings Data:", innings_data)
else:
    print("Failed to retrieve data from the URL.")

/////////////////////////

import requests
from bs4 import BeautifulSoup

url = "https://plaintextsports.com/mlb/2024-04-07/bal-pit#play-by-play"

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the play-by-play section
    play_by_play_section = soup.find("section", id="play-by-play")

    if play_by_play_section:
        # Extract the play-by-play data
        play_by_play = play_by_play_section.text.strip()

        # Print the play-by-play data
        print("Play-by-Play:")
        print(play_by_play)
    else:
        print("Play-by-play section not found on the page.")
else:
    print("Failed to retrieve data from the URL.")

////////////////////

Do these websites have some kind of security measure in place? Can we bypass it, or do we have to collect the data manually? I have tried multiple sites.

I want to scrape the box score and all plays with play info, like the following:

Diamondbacks (4-6)  2 - 5  Atlanta Braves (6-2)  Final

      1  2  3  4  5  6  7  8  9    T  H  E
------------------------------------------
ARI   0  0  0  2  0  0  0  0  0    2  5  1
ATL   0  2  0  1  0  0  0  2  x    5  7  0
  
W: Chris Sale (1-0)
L: Ryne Nelson (0-2)
S: Pierce Johnson (1)
Game Time: 2:30
 
Play-by-play:

1st Inning:

T1  Ketel Marte strikes out swinging.  0-0
T1  Corbin Carroll grounds out to first baseman Matt Olson.  0-0
T1  Lourdes Gurriel Jr. lines out sharply to left fielder Jarred Kelenic.  0-0

Middle 1st

B1  Ronald Acuña Jr. pops out to first baseman Christian Walker.  0-0
B1  Ozzie Albies grounds out to first baseman Christian Walker.  0-0
B1  Austin Riley pops out to first baseman Christian Walker in foul territory.  0-0

2nd Inning:

T2  Christian Walker strikes out swinging.  0-0
python-3.x web-scraping beautifulsoup
1 Answer

0 votes

These pages build the box score and play-by-play sections with JavaScript after the initial HTML arrives, so requests only ever sees the empty shell. Driving a real browser with Selenium executes that JavaScript first, after which the rendered text can be read:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time

# Set up Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # To run Chrome in headless mode
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--no-sandbox")

# Set up the Chrome driver
service = Service("path/to/your/chromedriver")  # Replace with the path to your chromedriver
driver = webdriver.Chrome(service=service, options=chrome_options)

# URL of the webpage to scrape
url = "https://plaintextsports.com/mlb/2024-04-07/bal-pit#play-by-play"

# Load the webpage
driver.get(url)

# Wait for the play-by-play section to load
time.sleep(5)  # Adjust the waiting time as needed

# Find the play-by-play section
play_by_play_section = driver.find_element(By.ID, "play-by-play")

# Get the play-by-play text
play_by_play_text = play_by_play_section.text

# Print the play-by-play data
print("Play-by-play:")
print(play_by_play_text)

# Close the browser
driver.quit()
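
As a small refinement, an explicit wait is more reliable than a fixed time.sleep, and Selenium 4 can download a matching chromedriver by itself, so the Service path becomes optional. A sketch of the same scrape with those changes:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

chrome_options = Options()
chrome_options.add_argument("--headless")

# Selenium 4+ resolves the driver binary automatically (Selenium Manager).
driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get("https://plaintextsports.com/mlb/2024-04-07/bal-pit#play-by-play")

    # Block until the play-by-play section exists, up to 15 seconds.
    section = WebDriverWait(driver, 15).until(
        EC.presence_of_element_located((By.ID, "play-by-play"))
    )
    print(section.text)
finally:
    driver.quit()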

Hope this helps.
