from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Fetch the FDA drug-shortages index page and parse it.
html = urlopen(
    "https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm")
bsObj = BeautifulSoup(html, "lxml")

# Main table: one row per drug (name + status columns).
table = bsObj.find('table', id="cont")
rows = table.findAll("tr")

# Collect every per-drug detail link found in the table.
links = [a['href'] for a in table.find_all('a', href=True) if a.text]
new_links = []
for link in links:
    # Relative href -> absolute URL; percent-encode spaces.
    new_links.append(
        ("https://www.accessdata.fda.gov/scripts/drugshortages/" + link)
        .replace(" ", "%20"))

# Fetch each detail page and keep its accordion table rows.
# NOTE(review): href_rows is built but never merged into the CSV below —
# main-page rows and detail rows are collected in separate passes, so
# there is no association between a drug and its detail rows. That is
# why the output contains only name and status.
href_rows = []
for link in new_links:
    link = link.replace("®", "%C2%AE")  # encode the registered-trademark sign
    html = urlopen(link)
    bsObj_href = BeautifulSoup(html, "lxml")
    div_href = bsObj_href.find("div", {"id": "accordion"})
    href_rows.append(div_href.findAll("tr"))

# Write the main-page columns (drug name, status) to CSV.
# `with` closes the file even if writing raises — replaces the
# manual try/finally + close() pattern.
with open("drug_shortage.csv", 'wt', newline='') as csvFile:
    writer = csv.writer(csvFile)
    for row in rows:
        csvRow = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csvRow)
你好,所以我创建了这样的两行。如果你去这个网站 https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm ,他们有药品名称和状态列,当你点击药品名称时,你可以找到四个更多的列。我想按药物名称把它们组合在一起,所以每行将是:药物名称、状态、剂型(presentation)、可用性和估计短缺时间、相关信息、短缺原因(per FDASIA)。但目前的代码只生成前两列(药物名称、状态)。我试过了:
for row in rows,rows_href:
但随后我得到了AttributeError。ResultSet对象没有属性'findAll'。我得到了同样的错误
for row in rows_href:
有什么建议可以让我按照自己的想法生成?
你的代码太混乱了。
你得到了所有的行,接下来得到了所有的链接,接下来你试图得到所有的其他信息,但是这样你就不能控制哪些值要加入到行中。最大的问题是当某些行在子页面上没有数据时,你所有的数据都会移动到上面一行。
你应该从主页面的表中获取所有的行,然后使用 for-loop
分别处理每一行,只为这一行获取其他元素--只为这一行读取链接,只为这一行从子页面获取数据,等等,并将这一行的所有数据放在列表中作为子列表。[name, status, link, presentation, availability, related, reason]
. 之后你就可以进行下一步的工作,并且只对下一行的数据进行工作。
BTW: 因为子页面的表格可能有很多行,所以我为子页面中的每一行数据都生成一条单独的记录——
名称、状态相同,但其他数值不同的产品
[name, status, values from first row on subpage]
[name, status, values from second row on subpage]
[name, status, values from third row on subpage]
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Fetch the FDA drug-shortages index page.
html = urlopen("https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm")
bsObj = BeautifulSoup(html, "lxml")

# list for all rows with all values
data = []

# get table on main page
table = bsObj.find('table', {'id': 'cont'})

# work with every row separately
for row in table.find_all("tr")[1:]:  # use `[1:]` to skip header
    # get columns only in this row
    cols = row.find_all('td')

    # get name and url from first column
    anchor = cols[0].find('a', href=True)
    name = anchor.text.strip()
    url = "https://www.accessdata.fda.gov/scripts/drugshortages/" + anchor['href']
    url = url.replace(" ", "%20").replace("®", "%C2%AE")
    print('name:', name)
    print('url:', url)

    # get status from second column
    status = cols[1].text.strip()
    print('status:', status)

    # subpage with the four extra columns for this drug
    html = urlopen(url)
    bsObj_href = BeautifulSoup(html, "lxml")
    subtable = bsObj_href.find("table")

    if not subtable:
        # No detail table: keep the row with empty detail columns so
        # nothing shifts. BUG FIX: store the `url` string, not the bs4
        # <a> Tag object (`link`), in the CSV row.
        data.append([name, status, url, '', '', '', ''])
        print('---')
    else:
        for subrow in subtable.find_all('tr')[1:]:  # use `[1:]` to skip header
            subcols = subrow.find_all('td')
            presentation = subcols[0].text.strip()
            availability = subcols[1].text.strip()
            related = subcols[2].text.strip()
            reason = subcols[3].text.strip()
            # One output row per detail row, repeating name/status.
            # BUG FIX: `url` here as well, not the Tag object.
            data.append([name, status, url, presentation, availability, related, reason])
            print(presentation, availability, related, reason)
            print('---')
    print('----------')

with open("drug_shortage.csv", 'wt', newline='') as csvfile:
    # BUG FIX: the `with` target is `csvfile`; the original passed
    # `csvFile`, a name not defined in this script -> NameError.
    writer = csv.writer(csvfile)
    # write header - one row - using `writerow` without `s` at the end
    #writer.writerow(['Name', 'Status', 'Link', 'Presentation', 'Availability', 'Related', 'Reason'])
    # write data - many rows - using `writerows` with `s` at the end
    writer.writerows(data)
    # no need to close because it uses `with`