我需要对网站进行数据抓取,但我的代码无法正常工作

问题描述 投票:0回答:0

使用 jupyter notebook 我正在尝试构建一个程序,该程序将进入该网站“https://npiprofile.com/clia/”,并在列名称“CLIA Number”下从 CSV 输入 CLIA ID。它将获取此列单元格中的数据,然后在网站中进行搜索。在此之后,将弹出 NPI 下的可点击链接,我需要程序点击它。完成后,需要收集:

“NPI”、“提供者名称”、“位置地址”、“分类”、“授权官方名称”、“授权官方头衔”和“授权官方电话”

代码运行但从未像以前的尝试那样打开网站提取数据。我有大约 20,000 个我必须提取的 clia 号码,所以任何帮助将不胜感激,因为我完全被困住了。

我使用的代码如下:




#import all files
import pandas as pd
import requests
from bs4 import BeautifulSoup

df = pd.read_csv('CLIA POL')
clia_numbers = df['CLIA Number']

base_url = 'https://npiprofile.com/clia/'

# Function to extract data from the NPI page
def extract_data(npi_soup):
    data = {}

    data['NPI'] = npi_soup.find('span', {'class': 'npi'}).text.strip()
    data['Provider Name'] = npi_soup.find('h1', {'class': 'name'}).text.strip()
    data['Location Address'] = npi_soup.find('div', {'class': 'location'}).find('p', {'class': 'address'}).text.strip()
    data['Classification'] = npi_soup.find('div', {'class': 'taxonomy'}).find('span', {'class': 'classification'}).text.strip()

    authorized_official = npi_soup.find('div', {'class': 'authorized-official'})
    data['Authorized Official Name'] = authorized_official.find('span', {'class': 'name'}).text.strip()
    data['Authorized Official Title'] = authorized_official.find('span', {'class': 'title'}).text.strip()
    data['Authorized Official Phone'] = authorized_official.find('span', {'class': 'phone'}).text.strip()

    return data


# Function to get the NPI page from the CLIA page
def get_npi_page(clia_number):
    url = base_url + clia_number

    response = requests.get(url, timeout=30)  # Increase the timeout duration
    soup = BeautifulSoup(response.content, 'html.parser')

    npi_link = soup.find('a', {'class': 'npi'})

    if npi_link:
        npi_url = npi_link['href']
        npi_response = requests.get(npi_url, timeout=30)  # Increase the timeout duration
        npi_soup = BeautifulSoup(npi_response.content, 'html.parser')

        return npi_soup
    else:
        return None


# Create an empty DataFrame to store the extracted data
data_columns = ['NPI', 'Provider Name', 'Location Address', 'Classification', 'Authorized Official Name', 'Authorized Official Title', 'Authorized Official Phone']
extracted_data = pd.DataFrame(columns=data_columns)

# Iterate through the CLIA numbers, get the NPI page, and extract the data
for clia_number in clia_numbers:
    npi_soup = get_npi_page(clia_number)

    if npi_soup:
        data = extract_data(npi_soup)
        extracted_data = extracted_data.append(data, ignore_index=True)

# Save the extracted data to a new CSV file
extracted_data.to_csv('extracted_data.csv', index=False)

我得到的错误信息如下:

TimeoutError                              Traceback (most recent call last)
~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    702             # Make the request on the httplib connection object.
--> 703             httplib_response = self._make_request(
    704                 conn,

~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    385         try:
--> 386             self._validate_conn(conn)
    387         except (SocketTimeout, BaseSSLError) as e:

~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
   1041         if not getattr(conn, "sock", None):  # AppEngine might not have  `.sock`
-> 1042             conn.connect()
   1043 

~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
    413 
--> 414         self.sock = ssl_wrap_socket(
    415             sock=conn,

~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
    448     if send_sni:
--> 449         ssl_sock = _ssl_wrap_socket_impl(
    450             sock, context, tls_in_tls, server_hostname=server_hostname

~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
    492     if server_hostname:
--> 493         return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
    494     else:

~\anaconda3\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
    500         # ctx._wrap_socket()
--> 501         return self.sslsocket_class._create(
    502             sock=sock,

~\anaconda3\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
   1040                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041                     self.do_handshake()
   1042             except (OSError, ValueError):

~\anaconda3\lib\ssl.py in do_handshake(self, block)
   1309                 self.settimeout(None)
-> 1310             self._sslobj.do_handshake()
   1311         finally:

TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

During handling of the above exception, another exception occurred:

ProtocolError                             Traceback (most recent call last)
~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    488             if not chunked:
--> 489                 resp = conn.urlopen(
    490                     method=request.method,

~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    786 
--> 787             retries = retries.increment(
    788                 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]

~\anaconda3\lib\site-packages\urllib3\util\retry.py in increment(self, method, url, response, error, _pool, _stacktrace)
    549             if read is False or not self._is_method_retryable(method):
--> 550                 raise six.reraise(type(error), error, _stacktrace)
    551             elif read is not None:

~\anaconda3\lib\site-packages\urllib3\packages\six.py in reraise(tp, value, tb)
    768             if value.__traceback__ is not tb:
--> 769                 raise value.with_traceback(tb)
    770             raise value

~\anaconda3\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
    702             # Make the request on the httplib connection object.
--> 703             httplib_response = self._make_request(
    704                 conn,

~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
    385         try:
--> 386             self._validate_conn(conn)
    387         except (SocketTimeout, BaseSSLError) as e:

~\anaconda3\lib\site-packages\urllib3\connectionpool.py in _validate_conn(self, conn)
   1041         if not getattr(conn, "sock", None):  # AppEngine might not have  `.sock`
-> 1042             conn.connect()
   1043 

~\anaconda3\lib\site-packages\urllib3\connection.py in connect(self)
    413 
--> 414         self.sock = ssl_wrap_socket(
    415             sock=conn,

~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in ssl_wrap_socket(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)
    448     if send_sni:
--> 449         ssl_sock = _ssl_wrap_socket_impl(
    450             sock, context, tls_in_tls, server_hostname=server_hostname

~\anaconda3\lib\site-packages\urllib3\util\ssl_.py in _ssl_wrap_socket_impl(sock, ssl_context, tls_in_tls, server_hostname)
    492     if server_hostname:
--> 493         return ssl_context.wrap_socket(sock, server_hostname=server_hostname)
    494     else:

~\anaconda3\lib\ssl.py in wrap_socket(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)
    500         # ctx._wrap_socket()
--> 501         return self.sslsocket_class._create(
    502             sock=sock,

~\anaconda3\lib\ssl.py in _create(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)
   1040                         raise ValueError("do_handshake_on_connect should not be specified for non-blocking sockets")
-> 1041                     self.do_handshake()
   1042             except (OSError, ValueError):

~\anaconda3\lib\ssl.py in do_handshake(self, block)
   1309                 self.settimeout(None)
-> 1310             self._sslobj.do_handshake()
   1311         finally:

ProtocolError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))

During handling of the above exception, another exception occurred:

ConnectionError                           Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_13720\1785300690.py in <module>
     39 # Iterate through the CLIA numbers, get the NPI page, and extract the data
     40 for clia_number in clia_numbers:
---> 41     npi_soup = get_npi_page(clia_number)
     42 `your text```your text``
     43     if npi_soup:

~\AppData\Local\Temp\ipykernel_13720\1785300690.py in get_npi_page(clia_number)
     19     url = base_url + clia_number
     20 
---> 21     response = requests.get(url)
     22     soup = BeautifulSoup(response.content, 'html.parser')
     23 

~\anaconda3\lib\site-packages\requests\api.py in get(url, params, **kwargs)
     71     """
     72 
---> 73     return request("get", url, params=params, **kwargs)
     74 
     75 

~\anaconda3\lib\site-packages\requests\api.py in request(method, url, **kwargs)
     57     # cases, and look like a memory leak in others.
     58     with sessions.Session() as session:
---> 59         return session.request(method=method, url=url, **kwargs)
     60 
     61 

~\anaconda3\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    585         }
    586         send_kwargs.update(settings)
--> 587         resp = self.send(prep, **send_kwargs)
    588 
    589         return resp

~\anaconda3\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
    699 
    700         # Send the request
--> 701         r = adapter.send(request, **kwargs)
    702 
    703         # Total elapsed time of the request (approximately)

~\anaconda3\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
    545 
    546         except (ProtocolError, OSError) as err:
--> 547             raise ConnectionError(err, request=request)
    548 
    549         except MaxRetryError as e:

ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))

我尝试增加等待时间,但从未完全奏效。任何帮助将不胜感激,因为我完全被困住了。

python web-scraping data-science text-extraction
© www.soinside.com 2019 - 2024. All rights reserved.