How can I scrape multiple names (i.e. the text from 'td' elements) on the same webpage into separate text files?

Problem description

First, I scrape the 'td' elements that contain the different job names (with links). Then I want to follow each of those 'td' links, scrape the job data from each linked page, and save what I scrape from each page into its own txt file. Can I do this? If you have any ideas, please share them!

import requests
from bs4 import BeautifulSoup

main = "https://deltaimmigration.com.au/Australia-jobs/"

def First():
    r = requests.get(main)
    soup = BeautifulSoup(r.text, 'html5lib')
    links = []
    with open("links.txt", 'w', newline="", encoding="UTF-8") as f:
        for item in soup.findAll("td", {'width': '250'}):
            # the <a> tag is the second child; strip the leading "../" from its href
            item = item.contents[1].get("href")[3:]
            item = f"https://deltaimmigration.com.au/{item}"
            f.write(item + "\n")
            links.append(item)
    print(f"We Have Collected {len(links)} urls")
    return links

def Second():
    links = First()
    with requests.Session() as req:
        for link in links:
            print(f"Extracting {link}")
            r = req.get(link, timeout=100)
            soup = BeautifulSoup(r.text, 'html5lib')
            # note: this only prints the job tables; the function returns None
            for item in soup.findAll("table", {'width': '900'}):
                print("*" * 40)
                print(item.text)
                print("*" * 40)

def Third():
    r = requests.get(main)
    soup = BeautifulSoup(r.text, 'html5lib')
    result = Second()          # Second() returns None, so result is always None
    for item in soup.findAll("td", {'width': '250'}):
        with open(item.text + '.txt', 'w', newline="", encoding="UTF-8") as f:
            f.write('result')  # writes the literal string 'result', not the scraped data

Third()
Tags: python, beautifulsoup
1 Answer
import requests
from bs4 import BeautifulSoup

main = "https://deltaimmigration.com.au/Australia-jobs/"


def First():
    r = requests.get(main)
    soup = BeautifulSoup(r.text, 'html5lib')
    links = []
    names = []
    with open("links.txt", 'w', newline="", encoding="UTF-8") as f:
        for item in soup.findAll("td", {'width': '250'}):
            name = item.contents[1].text              # job title, used later as the filename
            item = item.contents[1].get("href")[3:]   # strip the leading "../" from the href
            item = f"https://deltaimmigration.com.au/{item}"
            f.write(item + "\n")
            links.append(item)
            names.append(name)
    print(f"We Have Collected {len(links)} urls")
    return links, names


def Second():
    links, names = First()
    with requests.Session() as req:   # reuse one connection across all requests
        for link, name in zip(links, names):
            print(f"Extracting {link}")
            r = req.get(link, timeout=100)
            soup = BeautifulSoup(r.text, 'html5lib')
            for item in soup.findAll("table", {'width': '900'}):
                # one txt file per job, named after the job title
                with open(f"{name}.txt", 'w', newline="", encoding="UTF-8") as f:
                    f.write(item.text)


Second()
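
One caveat about this approach: the scraped job titles are used verbatim as filenames, and a title containing a character such as '/' or ':' will make open() fail on most platforms. Below is a minimal sketch of one way to guard against that; the safe_name helper is an illustration added here, not part of the original answer.

import re

def safe_name(name: str) -> str:
    # keep letters, digits, spaces, dashes and underscores;
    # replace anything else with '_' and cap the filename length
    return re.sub(r'[^\w\s-]', '_', name).strip()[:150]

Inside Second(), the file would then be opened with open(f"{safe_name(name)}.txt", 'w', newline="", encoding="UTF-8").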