我创建了一个网页抓取器(web scraper),但当我打印实例变量中的结果时,<td> 标签没有被去除。我该如何删除它们?我试过:
cols = [item.replace("'<td>", "") for item in cols]
但那没用。
代码是这样的:
def __init__(self):
# Build the URL stored on self.url.
# NOTE(review): `pages` is a range object, so str.format() inserts its repr
# ("range(1, 3000)") into the query string instead of a single page number.
pages = range(1, 3000, 1)
self.url = 'https://marknadssok.fi.se/publiceringsklient?Page={}'.format(pages)
def scrape_site(self):
# Fetch self.url, parse the first <tbody> of the response, and collect one
# entry per table row into self.datum, then print the collected list.
#All Columns
self.datum = []
#Establish connection
r = requests.get(self.url)
html = BeautifulSoup(r.content, "html.parser")
#Append each column to its attribute
table_body=html.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
cols = row.find_all('td')
# Text of every cell in the row, with surrounding whitespace removed.
cols = [x.text.strip() for x in cols]
# NOTE(review): this appends row('td')[0:1] (BeautifulSoup shorthand for
# row.find_all('td')[0:1], i.e. tag objects) rather than the stripped text
# held in `cols`, which is why the printed output still shows <td>...</td>.
# The closing parenthesis of append(...) is also missing on this line.
self.datum.append(row('td')[0:1]
print(self.datum)
我还有其他几个实例变量,但这里没有列出。我使用 append 的做法是受一个帖子的启发,那个帖子里有人在用 twitter.api 抓取数据时采用了类似的方法。
可以试试下面这样的写法:关键是把已经用 .text.strip() 提取出的文本列表 cols 追加到 self.datum,而不是追加 row('td') 返回的标签对象:
from bs4 import BeautifulSoup
import requests
class Test(object):
    """Scrape the first table column from marknadssok.fi.se result pages.

    After ``scrape_site()`` runs, ``self.datum`` holds one single-element
    list per table row containing the stripped text of that row's first
    ``<td>`` cell (or an empty list for a row without cells).
    """

    # URL template; ``{}`` is replaced by a page number.
    BASE_URL = 'https://marknadssok.fi.se/publiceringsklient?Page={}'

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Build one URL per page that should be scraped."""
        pages = range(1, 3, 1)
        # Format every page number on its own. Passing the range object
        # itself to str.format() would embed "range(1, 3)" in the URL.
        self.urls = [self.BASE_URL.format(page) for page in pages]
        # Kept for code that reads the single-URL attribute: the first page.
        self.url = self.urls[0]
        print(pages)

    def scrape_site(self):
        """Download every page in ``self.urls`` and fill ``self.datum``.

        Pages without a ``<tbody>`` element are skipped. The collected
        list is printed once all pages have been processed.
        """
        # All columns
        self.datum = []
        for url in self.urls:
            # Establish connection
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            html = BeautifulSoup(r.content, "html.parser")
            table_body = html.find('tbody')
            if table_body is None:
                # find() returns None when the page has no table body.
                continue
            for row in table_body.find_all('tr'):
                # Use the cell text, not the Tag objects, so no <td>
                # markup ends up in the results.
                cols = [cell.text.strip() for cell in row.find_all('td')]
                # Slicing keeps the one-element-list shape and tolerates
                # rows that have no cells at all.
                self.datum.append(cols[0:1])
        print(self.datum)
def __main__():
    """Create the scraper and run a single scrape."""
    Test().scrape_site()


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    __main__()