使用 jupyter notebook 我正在尝试构建一个程序,该程序将进入该网站“https://npiprofile.com/clia/”,并在列名称“CLIA Number”下从 CSV 输入 CLIA ID。它将获取此列单元格中的数据,然后在网站中进行搜索。在此之后,将弹出 NPI 下的可点击链接,我需要程序点击它。完成后,需要收集:
“NPI”、“提供者名称”、“位置地址”、“分类”、“授权官方名称”、“授权官方头衔”和“授权官方电话”
代码运行但从未像以前的尝试那样打开网站提取数据。我有大约 20,000 个我必须提取的 clia 号码,所以任何帮助将不胜感激,因为我完全被困住了。
我使用的代码如下:
#import all files
import pandas as pd
import requests
from bs4 import BeautifulSoup
df = pd.read_csv('CLIA POL')
clia_numbers = df['CLIA Number']
base_url = 'https://npiprofile.com/clia/'
# Function to extract data from the NPI page
def extract_data(npi_soup):
data = {}
data['NPI'] = npi_soup.find('span', {'class': 'npi'}).text.strip()
data['Provider Name'] = npi_soup.find('h1', {'class': 'name'}).text.strip()
data['Location Address'] = npi_soup.find('div', {'class': 'location'}).find('p', {'class': 'address'}).text.strip()
data['Classification'] = npi_soup.find('div', {'class': 'taxonomy'}).find('span', {'class': 'classification'}).text.strip()
authorized_official = npi_soup.find('div', {'class': 'authorized-official'})
data['Authorized Official Name'] = authorized_official.find('span', {'class': 'name'}).text.strip()
data['Authorized Official Title'] = authorized_official.find('span', {'class': 'title'}).text.strip()
data['Authorized Official Phone'] = authorized_official.find('span', {'class': 'phone'}).text.strip()
return data
# Function to get the NPI page from the CLIA page
def get_npi_page(clia_number):
url = base_url + clia_number
response = requests.get(url, timeout=30) # Increase the timeout duration
soup = BeautifulSoup(response.content, 'html.parser')
npi_link = soup.find('a', {'class': 'npi'})
if npi_link:
npi_url = npi_link['href']
npi_response = requests.get(npi_url, timeout=30) # Increase the timeout duration
npi_soup = BeautifulSoup(npi_response.content, 'html.parser')
return npi_soup
else:
return None
# Create an empty DataFrame to store the extracted data
data_columns = ['NPI', 'Provider Name', 'Location Address', 'Classification', 'Authorized Official Name', 'Authorized Official Title', 'Authorized Official Phone']
extracted_data = pd.DataFrame(columns=data_columns)
# Iterate through the CLIA numbers, get the NPI page, and extract the data
for clia_number in clia_numbers:
npi_soup = get_npi_page(clia_number)
if npi_soup:
data = extract_data(npi_soup)
extracted_data = extracted_data.append(data, ignore_index=True)
# Save the extracted data to a new CSV file
extracted_data.to_csv('extracted_data.csv', index=False)
我得到的错误信息如下:
TimeoutError Traceback (most recent call last)
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
385 try:
--> 386 self._validate_conn(conn)
387 except (SocketTimeout, BaseSSLError) as e:
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
1041 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1042 conn.connect()
1043
~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
413
--> 414 self.sock = ssl_wrap_socket(
415 sock=conn,
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
448 if send_sni:
--> 449 ssl_sock = _ssl_wrap_socket_impl(
450 sock, context, tls_in_tls, server_hostname=server_hostname
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
492 if server_hostname:
--> 493 return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
494 else:
~\anaconda3\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
500 # ctx._wrap_socket()
--> 501 return self.sslsocket_class._create(
502 sock=sock,
~\anaconda3\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
1040 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041 self.do_handshake()
1042 except (OSError, ValueError):
~\anaconda3\lib\ssl.py in do_handshake(self, block)
1309 self.settimeout(None)
-> 1310 self._sslobj.do_handshake()
1311 finally:
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
488 if not chunked:
--> 489 resp = conn.urlopen(
490 method=request.method,
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
786
--> 787 retries = retries.increment(
788 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
~\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
549 if read is False or not self._is_method_retryable(method):
--> 550 raise six.reraise(type(error), error, _stacktrace)
551 elif read is not None:
~\anaconda3\lib\site-packages\urllib3\packages\six.py in reraise(tp, value, tb)
768 if value.__traceback__ is not tb:
--> 769 raise value.with_traceback(tb)
770 raise value
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
385 try:
--> 386 self._validate_conn(conn)
387 except (SocketTimeout, BaseSSLError) as e:
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
1041 if not getattr(conn, "sock", None): # AppEngine might not have `.sock`
-> 1042 conn.connect()
1043
~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
413
--> 414 self.sock = ssl_wrap_socket(
415 sock=conn,
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
448 if send_sni:
--> 449 ssl_sock = _ssl_wrap_socket_impl(
450 sock, context, tls_in_tls, server_hostname=server_hostname
~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
492 if server_hostname:
--> 493 return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
494 else:
~\anaconda3\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
500 # ctx._wrap_socket()
--> 501 return self.sslsocket_class._create(
502 sock=sock,
~\anaconda3\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
1040 raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041 self.do_handshake()
1042 except (OSError, ValueError):
~\anaconda3\lib\ssl.py in do_handshake(self, block)
1309 self.settimeout(None)
-> 1310 self._sslobj.do_handshake()
1311 finally:
ProtocolError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_13720\1785300690.py in <module>
39 # Iterate through the CLIA numbers, get the NPI page, and extract the data
40 for clia_number in clia_numbers:
---> 41 npi_soup = get_npi_page(clia_number)
42 `your text```your text``
43 if npi_soup:
~\AppData\Local\Temp\ipykernel_13720\1785300690.py in get_npi_page(clia_number)
19 url = base_url + clia_number
20
---> 21 response = requests.get(url)
22 soup = BeautifulSoup(response.content, 'html.parser')
23
~\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
71 """
72
---> 73 return request("get", url, params=params, **kwargs)
74
75
~\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
57 # cases, and look like a memory leak in others.
58 with sessions.Session() as session:
---> 59 return session.request(method=method, url=url, **kwargs)
60
61
~\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
585 }
586 send_kwargs.update(settings)
--> 587 resp = self.send(prep, **send_kwargs)
588
589 return resp
~\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
699
700 # Send the request
--> 701 r = adapter.send(request, **kwargs)
702
703 # Total elapsed time of the request (approximately)
~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
545
546 except (ProtocolError, OSError) as err:
--> 547 raise ConnectionError(err, request=request)
548
549 except MaxRetryError as e:
ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
我尝试增加等待时间,但从未完全奏效。任何帮助将不胜感激,因为我完全被困住了。