使用 Selenium 抓取 Power BI 仪表板

问题描述 投票:0回答:1

我在使用 Selenium 抓取 Power BI 仪表板(Dashboard)时遇到了问题。我似乎已经正确地打开了 URL,代码结构看起来也没有问题,但代码无法成功解析第一列(Job Name)之后的所有列。

我不需要点击任何东西,只需向下滚动页面即可提取所有数据。

State 列数据的长度只有 150,而 Job Name 列的长度是 362。接下来的“Licensed, Registered or Certified by State”列,len(licensed_data) 仅返回 62。由于没写多少就遇到了上述问题,我没有继续为后面的列添加滚动代码。我对每一列都沿用了相同的 div 选择器写法,因为页面始终具有相同的 HTML 结构。

如果有人能帮我弄清楚问题出在哪里,我将不胜感激。再说一遍,我只是想抓取上面仪表板中的表格。

import time
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver

# Question script: opens a public Power BI report in Chrome and reads the text
# of each column container, locating every column by its aria-label.
option = webdriver.ChromeOptions()
option.add_argument("--start-maximized")
driver = webdriver.Chrome(options=option)
# Explicit wait with a 10-second timeout, reused for every lookup below.
wait = WebDriverWait(driver, 10)

# Load the page
driver.get("https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection")


# Block until the "Job Name" column container is visible.
job_name_data_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Job Name"]')))
# Scroll down to the bottom of the page to load all the data
# NOTE(review): this scrolls the browser window, not the table inside the
# report; the answer in this thread lists that as its first issue.
while True:
    # Scroll down using JavaScript
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # Adjust sleep time according to your page load speed
    
    # Check if we have reached the bottom of the page
    if driver.execute_script("return window.innerHeight + window.scrollY") >= driver.execute_script("return document.body.scrollHeight"):
        break

# Extract the text from the Job Name data element after scrolling
# NOTE(review): per the answer in this thread, rows/columns that move off
# screen are removed from the DOM, so this text presumably covers only the
# cells rendered at this moment -- confirm against the live page.
job_name_data = job_name_data_element.text

## state
state_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="State"]')))
# Scroll down to the bottom of the page to load all the data
# (same window-scroll loop as above, repeated verbatim)
while True:
    # Scroll down using JavaScript
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # Adjust sleep time according to your page load speed
    
    # Check if we have reached the bottom of the page
    if driver.execute_script("return window.innerHeight + window.scrollY") >= driver.execute_script("return document.body.scrollHeight"):
        break

state_data = state_column_element.text
## license
# NOTE(review): the header printed in the answer's output is
# 'Licensed, Registered or Certified by State'; confirm that the aria-label
# "Licensed" really targets that column.
licensed_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Licensed"]')))
while True:
    # Scroll down using JavaScript
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # Adjust sleep time according to your page load speed
    
    # Check if we have reached the bottom of the page
    if driver.execute_script("return window.innerHeight + window.scrollY") >= driver.execute_script("return document.body.scrollHeight"):
        break
licensed_data = licensed_column_element.text

# Bare expression: the length is computed and discarded when run as a script
# (it is only echoed in an interactive session/notebook). Note that this is
# the character count of the text, not a row count.
len(licensed_data)

## education
education_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Education Requirement"]')))
while True:
    # Scroll down using JavaScript
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # Adjust sleep time according to your page load speed
    
    # Check if we have reached the bottom of the page
    if driver.execute_script("return window.innerHeight + window.scrollY") >= driver.execute_script("return document.body.scrollHeight"):
        break
education_data = education_column_element.text

# From here on the columns are read without any scroll loop: each one is
# located by aria-label and its text is taken immediately.

## training
# NOTE(review): the answer's output shows this header with parentheses,
# 'Amount of Training Required (In Hours)'; confirm the bracketed label.
training_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Amount of Training Required [In Hours]"]')))
training_data = training_column_element.text


## experience
experience_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Amount of Experience Required"]')))
experience_data = experience_column_element.text

## pro exam
exam_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Professional Exam"]')))
exam_data = exam_column_element.text

## renewal time
renewal_time_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Required Time of License Renewal (In Years)"]')))
renewal_time_data = renewal_time_column_element.text

## continuing education
continious_education_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Continuing Education Requirement"]')))
continious_education_column_element_data = continious_education_column_element.text

## additional exams
additional_exams_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Additional Required Exams"]')))
additional_exams_column_element_data = additional_exams_column_element.text

## cost of initial licensure
cost_of_licensure_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Cost of Initial Licensure (In Dollars)"]')))
cost_of_licensure_column_element_data = cost_of_licensure_column_element.text

## license renewal
license_renewal_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Cost of License Renewal (In Dollars)"]')))
license_renewal_column_element_data = license_renewal_column_element.text

## reciprocity
reciprocity_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Reciprocity or Endorsement"]')))
reciprocity_column_element_data = reciprocity_column_element.text


## character
character_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Good Moral Character Requirement"]')))
character_column_element_data = character_column_element.text

## blanket ban
ban_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Blanket Ban for Ex-Offenders"]')))
ban_column_element_data = ban_column_element.text

## rehab
rehab_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Rehabilitation Requirement"]')))
rehab_column_element_data = rehab_column_element.text

## rehab
# NOTE(review): exact duplicate of the lookup directly above; it re-queries
# the same selector and overwrites the same two variables.
rehab_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Rehabilitation Requirement"]')))
rehab_column_element_data = rehab_column_element.text

## relationship
relationship_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Relationship between Offense and Occupation"]')))
relationship_column_element_data = relationship_column_element.text

##  Limitations
limitations_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Limitations on Scope of Inquiry"]')))
limitations_column_element_data = limitations_column_element.text

##  age
age_column_element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[aria-label="Minimum Age (In Years)"]')))
age_column_element_data = age_column_element.text
python selenium-webdriver web-scraping
1个回答
0
投票

所以有很多问题...

  1. 您正在滚动整个页面,而不仅仅是包含所需数据的表格。
  2. 当您水平或垂直滚动“表格”时,移出屏幕(不再可见)的元素(行和/或列)实际上会从 DOM 中消失。

这将是一场噩梦。

话虽如此,我确实编写了一些基本代码来帮助您入门。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Starter script: open the public Power BI report, locate the table that
# contains the "Job Name" header, then print the header texts followed by the
# cell texts of every row currently present in the DOM. It does not scroll the
# table, so only the rows/columns rendered on screen are printed.
url = 'https://app.powerbi.com/view?r=eyJrIjoiNzA0MGM4NGMtN2E5Ny00NDU3LWJiNzMtOWFlMGIyMDczZjg2IiwidCI6IjM4MmZiOGIwLTRkYzMtNDEwNy04MGJkLTM1OTViMjQzMmZhZSIsImMiOjZ9&pageName=ReportSection'
driver = webdriver.Chrome()
try:
    driver.maximize_window()
    driver.get(url)

    # Explicit wait: raises TimeoutException if the table is not visible
    # within 10 seconds.
    wait = WebDriverWait(driver, 10)

    # The table is the role="document" container that has a descendant div
    # whose text is exactly 'Job Name'.
    table = wait.until(EC.visibility_of_element_located((By.XPATH, "//div[@role='document'][.//div[text()='Job Name']]")))

    # Skip the first header, which is not a data column. Slicing (rather than
    # pop(0)) also tolerates an empty result instead of raising IndexError.
    headers = table.find_elements(By.CSS_SELECTOR, "div[role='columnheader']")[1:]
    h = [header.text.strip() for header in headers]
    print(h)

    # Skip the first row, which is empty.
    rows = table.find_elements(By.CSS_SELECTOR, "div[role='row']")[1:]
    for row in rows:
        # Skip the leading empty cell of each row; a row without any
        # gridcell simply prints an empty list.
        cells = row.find_elements(By.CSS_SELECTOR, "div[role='gridcell']")[1:]
        c = [cell.text for cell in cells]
        print(c)
finally:
    # Always end the session so the browser and driver processes are not
    # leaked, including when the wait above times out.
    driver.quit()

输出

['Job Name', 'State', 'Licensed, Registered or Certified by State', 'Education Requirement', 'Amount of Training Required (In Hours)', 'Amount of Experience Required', 'Professional Exam', 'Required Time of License Renewal (In Years)', 'Continuing Education Requirement', 'Additional Required Exams', 'Cost of Initial Licensure (In Dollars)', 'Cost of License Renewal (In Dollars)', 'Reciprocity or Endorsement', 'Good Moral Character Requirement']
['Athletic Trainer', 'Alabama', 'Licensed', 'A bachelor’s degree is required (from an accredited academic institution or similarly recognized institution)', '0', '0', 'Yes, individuals must take an exam to attain licensure', '1', '26 hrs x 1 yr', '0', '505', '75', 'State does have statutory language allowing reciprocity or endorsement agreements', 'State does not have a “good moral character" clause']

...等等

© www.soinside.com 2019 - 2024. All rights reserved.