首先,我要抓取包含不同职位名称(带有链接)的'td';接着,我想沿这些'td'中的链接进入各职位的网页,再次抓取其中的数据(即这些职位的详细信息),并将每个网页抓取到的数据分别保存到不同的txt文件中。我可以这样做吗?如果您有任何想法,请分享!
import requests
from bs4 import BeautifulSoup
main = "https://deltaimmigration.com.au/Australia-jobs/"
def First():
    """Collect every job-detail URL from the listing page.

    Writes each absolute URL to links.txt (one per line) and returns
    them all as a list of strings.
    """
    response = requests.get(main)
    page = BeautifulSoup(response.text, 'html5lib')
    collected = []
    with open("links.txt", 'w', newline="", encoding="UTF-8") as out:
        for cell in page.findAll("td", {'width': '250'}):
            # The href is relative (starts with "../"); drop that prefix
            # and rebuild an absolute URL on the site root.
            href = cell.contents[1].get("href")[3:]
            url = f"https://deltaimmigration.com.au/{href}"
            out.write(url + "\n")
            collected.append(url)
    print(f"We Have Collected {len(collected)} urls")
    return collected
def Second():
    """Visit every URL collected by First() and print each page's
    main content table, framed by separator lines."""
    urls = First()
    with requests.Session() as session:
        for url in urls:
            print(f"Extracting {url}")
            response = session.get(url, timeout=100)
            page = BeautifulSoup(response.text, 'html5lib')
            for table in page.findAll("table", {'width': '900'}):
                print("*" * 40)
                print(table.text)
                print("*" * 40)
def Third():
    """Save each job's detail page into its own .txt file.

    Fixes the original version, which (a) assigned ``result = Second()``
    although Second() returns None, and (b) wrote the literal string
    'result' into every file instead of the scraped page data.

    For every job cell on the listing page: follow its link, scrape the
    main content table(s) of the job page, and write their text to a
    file named after the job.
    """
    r = requests.get(main)
    soup = BeautifulSoup(r.text, 'html5lib')
    with requests.Session() as req:
        for item in soup.findAll("td", {'width': '250'}):
            # Rebuild the absolute job-page URL (href starts with "../").
            href = item.contents[1].get("href")[3:]
            link = f"https://deltaimmigration.com.au/{href}"
            print(f"Extracting {link}")
            # Job names come from page text and may contain characters
            # that are illegal in filenames (e.g. '/'); strip them out.
            name = item.text.strip()
            safe = "".join(c for c in name if c not in '\\/:*?"<>|').strip() or "job"
            page = req.get(link, timeout=100)  # bound each request so one slow page can't hang the run
            job_soup = BeautifulSoup(page.text, 'html5lib')
            with open(safe + '.txt', 'w', newline="", encoding="UTF-8") as f:
                for table in job_soup.findAll("table", {'width': '900'}):
                    f.write(table.text)
Third()
import requests
from bs4 import BeautifulSoup
main = "https://deltaimmigration.com.au/Australia-jobs/"
def First():
    """Collect job URLs and job names from the listing page.

    Writes each absolute URL to links.txt (one per line) and returns a
    pair of parallel lists: (links, names).
    """
    response = requests.get(main)
    page = BeautifulSoup(response.text, 'html5lib')
    urls, titles = [], []
    with open("links.txt", 'w', newline="", encoding="UTF-8") as out:
        for cell in page.findAll("td", {'width': '250'}):
            anchor = cell.contents[1]
            # Drop the relative "../" prefix and rebuild an absolute URL.
            full = f"https://deltaimmigration.com.au/{anchor.get('href')[3:]}"
            out.write(full + "\n")
            urls.append(full)
            titles.append(anchor.text)
    print(f"We Have Collected {len(urls)} urls")
    return urls, titles
def Second():
    """Fetch every job page returned by First() and write each page's
    main content table to "<job name>.txt".

    Fixes over the original:
    - Job names are scraped page text and may contain characters that
      are illegal in filenames (e.g. '/', ':'), which made open() raise;
      those characters are now stripped before building the filename.
    - Adds a request timeout (matching the earlier version of this
      scraper) so one unresponsive page cannot hang the whole run.
    """
    links, names = First()
    with requests.Session() as req:
        for link, name in zip(links, names):
            print(f"Extracting {link}")
            r = req.get(link, timeout=100)
            soup = BeautifulSoup(r.text, 'html5lib')
            # Sanitize the scraped name into a filesystem-safe filename.
            safe = "".join(c for c in name if c not in '\\/:*?"<>|').strip() or "job"
            for item in soup.findAll("table", {'width': '900'}):
                with open(f"{safe}.txt", 'w', newline="", encoding="UTF-8") as f:
                    f.write(item.text)
Second()