我试着把它分别作为链接和日期来做,但我遇到了数据帧计数不匹配的问题,我想知道如何合并这两个列表。我决定同时提取链接和日期,但现在我不能得到任何结果。
我的数据框架应该只有链接和报告年月日。
下面是一个html的例子
<tr>
<td headers="view-dlf-1-title-table-column--G7-URXF07Ms" class="views-field views-field-dlf-1-title">
<a href="/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Contract-and-Enrollment-Summary-Report-Items/Contract-Summary-2013-03">Contract Summary</a> </td>
<td headers="view-dlf-2-report-period-table-column--G7Rqagd92Ho" class="views-field views-field-dlf-2-report-period">2013-03 </td>
</tr>
这是我现在的代码
import pandas as pd
from datetime import datetime
from lxml import html
import requests
def http_request_get(url, session=None, payload=None, parse=True):
    """Send an HTTP GET to *url* and return (content, final URL).

    When *parse* is true the content is an lxml HTML document,
    otherwise the raw response text. Uses *session* when given,
    falling back to a one-off requests call.
    """
    if payload is None:
        payload = {}
    client = session if session else requests
    response = client.get(url, params=payload, verify=False,
                          headers={"content-type": "text"})
    response.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
    if parse:
        return html.fromstring(response.text), response.url
    return response.text, response.url
def get_html(link):
    """Fetch *link* and return it parsed as an lxml HTML document."""
    document, _ = http_request_get(url=link, payload={'t': ''}, parse=True)
    return document
cmslinks = [
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=0',
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=1',
]

# Collect one (URL, ReportTime) frame per listing page, then combine them.
frames = []
for cmslink in cmslinks:
    content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
    # The "headers" attribute carries a per-page random suffix (e.g.
    # "--G7-URXF07Ms"), so an exact attribute match finds nothing;
    # match on the stable prefix instead.
    link_cells = content.cssselect('td[headers^="view-dlf-1-title-table-column"]')
    date_cells = content.cssselect('td[headers^="view-dlf-2-report-period-table-column"]')
    # The href lives on the <a> child, not on the <td> itself.
    urls = [cell.find('a').get('href') for cell in link_cells]
    # NOTE: zip('ReportTime', ...) would pair the *characters*
    # 'R', 'e', 'p', ... with the values; build the column directly.
    dates = [cell.text_content().strip() for cell in date_cells]
    frames.append(pd.DataFrame({'URL': urls, 'ReportTime': dates}))

# Accumulate across pages instead of overwriting the result each iteration.
mergedDf = pd.concat(frames, ignore_index=True)
试试这个。
import pandas as pd
from datetime import datetime
from lxml import html
import requests
def http_request_get(url, session=None, payload=None, parse=True):
    """Send a GET request to *url*; return (lxml document or raw text, final URL)."""
    if payload is None:
        payload = {}
    resp = (session.get(url, params=payload, verify=False,
                        headers={"content-type": "text"})
            if session
            else requests.get(url, params=payload, verify=False,
                              headers={"content-type": "text"}))
    resp.raise_for_status()  # surface 4xx/5xx responses as HTTPError
    return (html.fromstring(resp.text), resp.url) if parse else (resp.text, resp.url)
def get_html(link):
    """Download *link* and return the parsed lxml HTML tree."""
    parsed_page, _ = http_request_get(url=link, payload={'t': ''}, parse=True)
    return parsed_page
cmslinks = [
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=0',
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=1',
]

# One frame per page; collected and concatenated after the loop so the
# second page does not overwrite the first.
frames = []
for cmslink in cmslinks:
    content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
    # An absolute XPath ("//...") scans the whole document once; running it
    # per row would just repeat the same full-document query each time.
    urls = content.xpath("//td/a/@href")
    # NOTE: zip('ReportTime', ...) pairs the *characters* of the string
    # with the values — build the column from the date cells directly.
    dates = [td.text_content().strip()
             for td in content.xpath("//td[@class='views-field views-field-dlf-2-report-period']")]
    frames.append(pd.DataFrame({'URL': urls, 'ReportTime': dates}))

mergedDf = pd.concat(frames, ignore_index=True)
# Combine pandas' own table parsing with link hrefs scraped via lxml.
# NOTE: pd.read_html(cmslink) fetches the page a second time — acceptable
# here, but the parsed `content` could be serialized and reused instead.
page_frames = []
for cmslink in cmslinks:
    content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
    table = pd.read_html(cmslink)[0]
    # Absolute XPath returns every table link's href in document (row)
    # order; contains(text(), '') is always true, so it was dropped.
    # Querying `content` directly also avoids an IndexError when the
    # cssselect lookup matches nothing.
    urls = content.xpath("//td/a/@href")
    table['Title'] = urls
    page_frames.append(table)

# DataFrame.append() was removed in pandas 2.0; concatenate the pages.
full_table = pd.concat(page_frames, ignore_index=True)
print(full_table)
我在这里会选择 BeautifulSoup。它是一个相当简单的 HTML 解析库。思路是:抓取所有带 href
属性的 <a>
标签(具体来说,是 href 中包含 "Enrollment-by-Contract"
的链接),然后获取紧随其后的 <td>
标签,其文本就是同一行下一个表格单元格中的报告年月。
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from lxml import html
import requests
def http_request_get(url, session=None, payload=None, parse=True):
    """Send a GET request; return (BeautifulSoup document or raw text, final URL)."""
    if payload is None:
        payload = {}
    # Browser-like User-Agent: some servers reject the default requests UA.
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        "content-type": "text",
    }
    client = session if session else requests
    response = client.get(url, params=payload, verify=False, headers=request_headers)
    response.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
    if parse:
        return BeautifulSoup(response.text, 'html.parser'), response.url
    return response.text, response.url
def get_html(link):
    """Download *link* and return it parsed with BeautifulSoup."""
    soup, _ = http_request_get(url=link, payload={'t': ''}, parse=True)
    return soup
cmslinks = [
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=0',
    'https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/MCRAdvPartDEnrolData/Monthly-Enrollment-by-Contract?items_per_page=100&items_per_page_options%5B5%5D=5%20per%20page&items_per_page_options%5B10%5D=10%20per%20page&items_per_page_options%5B25%5D=25%20per%20page&items_per_page_options%5B50%5D=50%20per%20page&items_per_page_options%5B100%5D=100%20per%20page&combine=&page=1',
]

# One frame per listing page, concatenated once at the end:
# DataFrame.append() was removed in pandas 2.0.
frames = []
for cmslink in cmslinks:
    content, _ = http_request_get(url=cmslink, payload={'t': ''}, parse=True)
    table = content.find('table')
    anchors = table.find_all('a', href=True)
    # Keep only "Enrollment-by-Contract" links; the report period is the
    # text of the next <td> cell in the same row.
    rows = [[a.get('href'), a.find_next('td').text.strip()]
            for a in anchors
            if 'Enrollment-by-Contract' in a.get('href')]
    frames.append(pd.DataFrame(rows))

df = pd.concat(frames, sort=False).reset_index(drop=True)
输出。
print (df)
0 1
0 /Research-Statistics-Data-and-Systems/Statisti... 2019-10
1 /Research-Statistics-Data-and-Systems/Statisti... 2019-09
2 /Research-Statistics-Data-and-Systems/Statisti... 2019-08
3 /Research-Statistics-Data-and-Systems/Statisti... 2019-07
4 /Research-Statistics-Data-and-Systems/Statisti... 2019-06
5 /Research-Statistics-Data-and-Systems/Statisti... 2019-05
6 /Research-Statistics-Data-and-Systems/Statisti... 2019-04
7 /Research-Statistics-Data-and-Systems/Statisti... 2019-03
8 /Research-Statistics-Data-and-Systems/Statisti... 2019-02
9 /Research-Statistics-Data-and-Systems/Statisti... 2019-01
10 /Research-Statistics-Data-and-Systems/Statisti... 2018-12
11 /Research-Statistics-Data-and-Systems/Statisti... 2018-11
12 /Research-Statistics-Data-and-Systems/Statisti... 2018-10
13 /Research-Statistics-Data-and-Systems/Statisti... 2018-09
14 /Research-Statistics-Data-and-Systems/Statisti... 2018-08
15 /Research-Statistics-Data-and-Systems/Statisti... 2018-07
16 /Research-Statistics-Data-and-Systems/Statisti... 2018-06
17 /Research-Statistics-Data-and-Systems/Statisti... 2018-05
18 /Research-Statistics-Data-and-Systems/Statisti... 2018-04
19 /Research-Statistics-Data-and-Systems/Statisti... 2018-03
20 /Research-Statistics-Data-and-Systems/Statisti... 2018-02
21 /Research-Statistics-Data-and-Systems/Statisti... 2018-01
22 /Research-Statistics-Data-and-Systems/Statisti... 2017-12
23 /Research-Statistics-Data-and-Systems/Statisti... 2017-11
24 /Research-Statistics-Data-and-Systems/Statisti... 2017-10
25 /Research-Statistics-Data-and-Systems/Statisti... 2017-09
26 /Research-Statistics-Data-and-Systems/Statisti... 2017-08
27 /Research-Statistics-Data-and-Systems/Statisti... 2017-07
28 /Research-Statistics-Data-and-Systems/Statisti... 2017-06
29 /Research-Statistics-Data-and-Systems/Statisti... 2017-05
.. ... ...
129 /Research-Statistics-Data-and-Systems/Statisti... 2008-12
130 /Research-Statistics-Data-and-Systems/Statisti... 2008-11
131 /Research-Statistics-Data-and-Systems/Statisti... 2008-10
132 /Research-Statistics-Data-and-Systems/Statisti... 2008-09
133 /Research-Statistics-Data-and-Systems/Statisti... 2008-08
134 /Research-Statistics-Data-and-Systems/Statisti... 2008-07
135 /Research-Statistics-Data-and-Systems/Statisti... 2008-06
136 /Research-Statistics-Data-and-Systems/Statisti... 2008-05
137 /Research-Statistics-Data-and-Systems/Statisti... 2008-04
138 /Research-Statistics-Data-and-Systems/Statisti... 2008-03
139 /Research-Statistics-Data-and-Systems/Statisti... 2008-02
140 /Research-Statistics-Data-and-Systems/Statisti... 2008-01
141 /Research-Statistics-Data-and-Systems/Statisti... 2007-12
142 /Research-Statistics-Data-and-Systems/Statisti... 2007-11
143 /Research-Statistics-Data-and-Systems/Statisti... 2007-10
144 /Research-Statistics-Data-and-Systems/Statisti... 2007-09
145 /Research-Statistics-Data-and-Systems/Statisti... 2007-08
146 /Research-Statistics-Data-and-Systems/Statisti... 2007-07
147 /Research-Statistics-Data-and-Systems/Statisti... 2007-06
148 /Research-Statistics-Data-and-Systems/Statisti... 2007-05
149 /Research-Statistics-Data-and-Systems/Statisti... 2007-04
150 /Research-Statistics-Data-and-Systems/Statisti... 2007-03
151 /Research-Statistics-Data-and-Systems/Statisti... 2007-02
152 /Research-Statistics-Data-and-Systems/Statisti... 2007-01
153 /Research-Statistics-Data-and-Systems/Statisti... 2006-12
154 /Research-Statistics-Data-and-Systems/Statisti... 2006-11
155 /Research-Statistics-Data-and-Systems/Statisti... 2006-10
156 /Research-Statistics-Data-and-Systems/Statisti... 2006-09
157 /Research-Statistics-Data-and-Systems/Statisti... 2006-08
158 /Research-Statistics-Data-and-Systems/Statisti... 2012-11
[159 rows x 2 columns]