Can this logic be done with Scrapy?


I built a project in which I loop over every student ID at my university to fetch each student's grades, build an analysis dashboard for each student, email them their results, and later put together a nice report. I scrape the website where our faculty uploads the results. The code looks like this:

#Importing the necessary modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

#Reading the data
our_ids = pd.read_excel("All Our IDs.xlsx")

total_students = our_ids.shape[0]

df_to_hold_all_data = pd.DataFrame()


#Defining Functions to use in the script
def make_request(student_id):
    """
    Request the results page for the given student ID,
    retrying until a successful (HTTP 200) response is received.
    """

    url = 'http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp' #Base URL to our college website

    params = {
        'z_dep': '=',
        'z_st_name': 'LIKE',
        'z_st_settingno': '=',
        'x_st_settingno': f'{student_id}',
        'x_st_name': '',
        'z_gro': '=',
        'x_gro': '',
        'x_dep': '',
        'z_sec': 'LIKE',
        'x_sec': '',
        'Submit': '++++حفظ++++'
    }

    response_state =  0

    while response_state != 200 :

        try:
            response = requests.get(url,params= params, timeout= 10 )
            
        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout):
            print("Requesting Again...")
            continue

        response_state = response.status_code
    
    return response

def make_the_second_request_with_selenium(link):
    # Create a headless Edge driver
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Edge(options=options)

    # Set timeout for the request and try to navigate to a website 
    timeout = 10  # seconds

    try: 
        driver.get(link)

        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b')))

        return driver # Return the driver so the caller can read the result page.

    except (TimeoutException, NoSuchElementException): # If the page takes more than 10 seconds or the element is missing, retry the request.

        print("Requesting Again...")
        driver.quit() # Close the failed driver before retrying so browser processes don't pile up.
        return make_the_second_request_with_selenium(link) # Return the retried driver (the original recursive call dropped the return value).

this_loop = 0

#Looping for all students
for student_id in our_ids['IDS'].unique():

    print(f"\nNow Looping for {student_id}\n")
    response = make_request(student_id) # Making our response
    print(f"{response.status_code}")
    # Parse the response and create a BeautifulSoup object 
    soup = BeautifulSoup(response.text, 'html.parser') 

    links = soup.find_all('a',{'href': True})

    link_to_natega = ''

    for link in links:

        if "StdCode" in link['href']:
            # Build the link to this student's results page; each student has a unique link.
            link_to_natega = f"http://app1.helwan.edu.eg/Commerce/{link['href']}"

    print(link_to_natega)
    
    try:
        driver = make_the_second_request_with_selenium(link_to_natega)

        name = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b').text
        id_of_student = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[4]/div/font/b').text
        department = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[5]/td[2]/div/font/b').text
        first_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[3]/td[2]/div/font/b').text 
        first_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[3]/td[4]/div/font/b').text
        second_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[4]/td[2]/div/font/b').text
        second_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[4]/td[4]/div/font/b').text
        third_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[5]/td[2]/div/font/b').text
        third_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[5]/td[4]/div/font/b').text
        fourth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[6]/td[2]/div/font/b').text
        fourth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[6]/td[4]/div/font/b').text
        fifth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[7]/td[2]/div/font/b').text
        fifth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[7]/td[4]/div/font/b').text
        sixth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[8]/td[2]/div/font/b').text
        sixth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[8]/td[4]/div/font/b').text
        
        data = {'name': name , 'ID' : id_of_student , "Department" : department , \
            "Subject" : [first_sub,second_sub,third_sub,fourth_sub,fifth_sub,sixth_sub],\
                "Score": [first_sub_score,second_sub_score,third_sub_score,fourth_sub_score,fifth_sub_score,sixth_sub_score]
        }
        
        df = pd.DataFrame(data) #Create a DataFrame

        df_to_hold_all_data = pd.concat([df_to_hold_all_data, df], ignore_index=True) # Append it to the dataframe we created above (DataFrame.append was removed in pandas 2.x).

        # Close the driver
        driver.quit()
        print(f"The shape of the data now is: {df_to_hold_all_data.shape}")
    except Exception as e: # Catch any failure for this student and move on to the next one.
        print(f'failed to get data for {student_id}: {e}')

    this_loop += 1
    remaining_students = total_students - this_loop
    print(f'Done Looping For {student_id} The remaining students: {remaining_students}')


df_to_hold_all_data.to_excel("All Our Results.xlsx",index=False)

I don't know whether this could be built with Scrapy. If it can, how much faster would it make the process? Is it worth investing the time and effort to learn it and rewrite the code?
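
For concreteness, here is a rough, untested sketch of how I imagine the first part (requesting the list page for each ID and following the StdCode link) might look as a Scrapy spider. The spider name, the simplified XPath, and the assumption that the result page renders without JavaScript are guesses on my part, not something I have verified:

# A rough, untested sketch; names and selectors below are assumptions, not verified against the real site.
from urllib.parse import urlencode

import pandas as pd
import scrapy


class NategaSpider(scrapy.Spider):
    name = "natega"  # Hypothetical spider name.

    def start_requests(self):
        our_ids = pd.read_excel("All Our IDs.xlsx")
        url = 'http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp'
        for student_id in our_ids['IDS'].unique():
            params = {
                'z_dep': '=', 'z_st_name': 'LIKE', 'z_st_settingno': '=',
                'x_st_settingno': str(student_id), 'x_st_name': '',
                'z_gro': '=', 'x_gro': '', 'x_dep': '',
                'z_sec': 'LIKE', 'x_sec': '', 'Submit': '++++حفظ++++'
            }
            # Scrapy schedules these requests concurrently (CONCURRENT_REQUESTS)
            # and retries failures on its own (RETRY_TIMES, DOWNLOAD_TIMEOUT).
            yield scrapy.Request(f"{url}?{urlencode(params)}",
                                 callback=self.parse_list,
                                 cb_kwargs={'student_id': student_id})

    def parse_list(self, response, student_id):
        # Same idea as the BeautifulSoup loop: find the link that contains "StdCode".
        for href in response.css('a::attr(href)').getall():
            if 'StdCode' in href:
                yield response.follow(href, callback=self.parse_result,
                                      cb_kwargs={'student_id': student_id})

    def parse_result(self, response, student_id):
        # If the result page is plain server-rendered HTML, the same table cells the
        # Selenium XPaths read could be extracted here; the XPath below is a guess
        # and would need to be adjusted against the real page.
        name = response.xpath('//table[1]//tr[3]/td[2]//b/text()').get()
        yield {'ID': student_id, 'name': name}

If the result page really does need a browser, I guess the Selenium step would still have to stay, or be replaced with something like scrapy-selenium or scrapy-playwright.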

Edit: Sorry for the poor structure; data analysis and statistics are the parts I'm actually good at :D

Your help would be greatly appreciated.

python python-requests scrapy