UnicodeEncodeError:“ascii”编解码器无法对位置 16 中的字符“\xed”进行编码:序数不在范围(128)中

问题描述 投票:0回答:1

我正在学习 pandas 并尝试在 Jupyter 中获取维基百科页面的表格,但出现此错误:

UnicodeEncodeError: 'ascii' codec can't encode character '\xed' in position 16: ordinal not in range(128)

# NOTE: this call raises UnicodeEncodeError. The URL contains non-ASCII
# characters ("í", "é"), and the HTTP request line is encoded as ASCII by
# http.client (see the last frame of the traceback below).
df_mx = pd.read_html('https://es.wikipedia.org/wiki/Economía_de_México', match='Indicadores macroeconómicos, financieros y de bienestar')

我得到了如下错误回溯:

---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
Cell In[14], line 1
----> 1 df_mx = pd.read_html('https://es.wikipedia.org/wiki/Economía_de_México', match='Indicadores macroeconómicos, financieros y de bienestar')

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:1212, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links, dtype_backend)
   1208 check_dtype_backend(dtype_backend)
   1210 io = stringify_path(io)
-> 1212 return _parse(
   1213     flavor=flavor,
   1214     io=io,
   1215     match=match,
   1216     header=header,
   1217     index_col=index_col,
   1218     skiprows=skiprows,
   1219     parse_dates=parse_dates,
   1220     thousands=thousands,
   1221     attrs=attrs,
   1222     encoding=encoding,
   1223     decimal=decimal,
   1224     converters=converters,
   1225     na_values=na_values,
   1226     keep_default_na=keep_default_na,
   1227     displayed_only=displayed_only,
   1228     extract_links=extract_links,
   1229     dtype_backend=dtype_backend,
   1230 )

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:1001, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs)
    999 else:
   1000     assert retained is not None  # for mypy
-> 1001     raise retained
   1003 ret = []
   1004 for table in tables:

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:981, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs)
    978 p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
    980 try:
--> 981     tables = p.parse_tables()
    982 except ValueError as caught:
    983     # if `io` is an io-like object, check if it's seekable
    984     # and try to rewind it before trying the next parser
    985     if hasattr(io, "seekable") and io.seekable():

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:257, in _HtmlFrameParser.parse_tables(self)
    249 def parse_tables(self):
    250     """
    251     Parse and return all tables from the DOM.
    252 
   (...)
    255     list of parsed (header, body, footer) tuples from tables.
    256     """
--> 257     tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
    258     return (self._parse_thead_tbody_tfoot(table) for table in tables)

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:666, in _BeautifulSoupHtml5LibFrameParser._build_doc(self)
    663 def _build_doc(self):
    664     from bs4 import BeautifulSoup
--> 666     bdoc = self._setup_build_doc()
    667     if isinstance(bdoc, bytes) and self.encoding is not None:
    668         udoc = bdoc.decode(self.encoding)

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:658, in _BeautifulSoupHtml5LibFrameParser._setup_build_doc(self)
    657 def _setup_build_doc(self):
--> 658     raw_text = _read(self.io, self.encoding)
    659     if not raw_text:
    660         raise ValueError(f"No text parsed from document: {self.io}")

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:155, in _read(obj, encoding)
    149 text: str | bytes
    150 if (
    151     is_url(obj)
    152     or hasattr(obj, "read")
    153     or (isinstance(obj, str) and file_exists(obj))
    154 ):
--> 155     with get_handle(obj, "r", encoding=encoding) as handles:
    156         text = handles.handle.read()
    157 elif isinstance(obj, (str, bytes)):

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:716, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    713     codecs.lookup_error(errors)
    715 # open URLs
--> 716 ioargs = _get_filepath_or_buffer(
    717     path_or_buf,
    718     encoding=encoding,
    719     compression=compression,
    720     mode=mode,
    721     storage_options=storage_options,
    722 )
    724 handle = ioargs.filepath_or_buffer
    725 handles: list[BaseBuffer]

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:368, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    366 # assuming storage_options is to be interpreted as headers
    367 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 368 with urlopen(req_info) as req:
    369     content_encoding = req.headers.get("Content-Encoding", None)
    370     if content_encoding == "gzip":
    371         # Override compression based on Content-Encoding header

File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:270, in urlopen(*args, **kwargs)
    264 """
    265 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
    266 the stdlib.
    267 """
    268 import urllib.request
--> 270 return urllib.request.urlopen(*args, **kwargs)

File ~/anaconda3/lib/python3.11/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    214 else:
    215     opener = _opener
--> 216 return opener.open(url, data, timeout)

File ~/anaconda3/lib/python3.11/urllib/request.py:519, in OpenerDirector.open(self, fullurl, data, timeout)
    516     req = meth(req)
    518 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 519 response = self._open(req, data)
    521 # post-process response
    522 meth_name = protocol+"_response"

File ~/anaconda3/lib/python3.11/urllib/request.py:536, in OpenerDirector._open(self, req, data)
    533     return result
    535 protocol = req.type
--> 536 result = self._call_chain(self.handle_open, protocol, protocol +
    537                           '_open', req)
    538 if result:
    539     return result

File ~/anaconda3/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    494 for handler in handlers:
    495     func = getattr(handler, meth_name)
--> 496     result = func(*args)
    497     if result is not None:
    498         return result

File ~/anaconda3/lib/python3.11/urllib/request.py:1391, in HTTPSHandler.https_open(self, req)
   1390 def https_open(self, req):
-> 1391     return self.do_open(http.client.HTTPSConnection, req,
   1392         context=self._context, check_hostname=self._check_hostname)

File ~/anaconda3/lib/python3.11/urllib/request.py:1348, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args)
   1346 try:
   1347     try:
-> 1348         h.request(req.get_method(), req.selector, req.data, headers,
   1349                   encode_chunked=req.has_header('Transfer-encoding'))
   1350     except OSError as err: # timeout error
   1351         raise URLError(err)

File ~/anaconda3/lib/python3.11/http/client.py:1286, in HTTPConnection.request(self, method, url, body, headers, encode_chunked)
   1283 def request(self, method, url, body=None, headers={}, *,
   1284             encode_chunked=False):
   1285     """Send a complete request to the server."""
-> 1286     self._send_request(method, url, body, headers, encode_chunked)

File ~/anaconda3/lib/python3.11/http/client.py:1297, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked)
   1294 if 'accept-encoding' in header_names:
   1295     skips['skip_accept_encoding'] = 1
-> 1297 self.putrequest(method, url, **skips)
   1299 # chunked encoding will happen if HTTP/1.1 is used and either
   1300 # the caller passes encode_chunked=True or the following
   1301 # conditions hold:
   1302 # 1. content-length has not been explicitly set
   1303 # 2. the body is a file or iterable, but not a str or bytes-like
   1304 # 3. Transfer-Encoding has NOT been explicitly set by the caller
   1306 if 'content-length' not in header_names:
   1307     # only chunk body if not explicitly set for backwards
   1308     # compatibility, assuming the client code is already handling the
   1309     # chunking

File ~/anaconda3/lib/python3.11/http/client.py:1135, in HTTPConnection.putrequest(self, method, url, skip_host, skip_accept_encoding)
   1131 self._validate_path(url)
   1133 request = '%s %s %s' % (method, url, self._http_vsn_str)
-> 1135 self._output(self._encode_request(request))
   1137 if self._http_vsn == 11:
   1138     # Issue some standard headers for better HTTP/1.1 compliance
   1140     if not skip_host:
   1141         # this header is issued *only* for HTTP/1.1
   1142         # connections. more specifically, this means it is
   (...)
   1152         # but the host of the actual URL, not the host of the
   1153         # proxy.

File ~/anaconda3/lib/python3.11/http/client.py:1215, in HTTPConnection._encode_request(self, request)
   1213 def _encode_request(self, request):
   1214     # ASCII also helps prevent CVE-2019-9740.
-> 1215     return request.encode('ascii')

UnicodeEncodeError: 'ascii' codec can't encode character '\xed' in position 16: ordinal not in range(128)

我尝试加上 .encode('UTF-8'),但没有效果。

python pandas encode
1个回答
0
投票

我无法单独使用 pandas 成功读取该页面(凭据错误)。改用 `requests` 模块读取页面,再把文本内容传给 pandas 是可行的。注意下面代码中的 URL 已对非 ASCII 字符做了百分号编码(í → %C3%AD,é → %C3%A9)。另外,`match` 必须是表格本身包含的文本,而不是维基百科页面上的标题,并且需要用 `header` 指定表头行:

import requests
import pandas as pd
import io

# Fetch the page with requests. The non-ASCII characters of the article title
# are percent-encoded (í -> %C3%AD, é -> %C3%A9) so the URL is pure ASCII.
response = requests.get(
    'https://es.wikipedia.org/wiki/Econom%C3%ADa_de_M%C3%A9xico',
    timeout=30,  # avoid hanging forever if the server never answers
)
# Fail loudly on an HTTP error instead of handing an error page to pandas,
# which would otherwise surface as a confusing parse error.
response.raise_for_status()

# `match` must be text found inside the table itself; header=0 uses the first
# table row as the column names. Wrapping the HTML in StringIO makes pandas
# treat it as document content rather than as a path or URL.
matches = pd.read_html(io.StringIO(response.text), match='Crecimiento del PIB', header=0)
for table in matches:
    print(table)

输出:

     Año  PIB PPA (miles de mdd)  PIB PPA per cápita Crecimiento del PIB Tasa de inflación Desempleo Deuda pública
0   1980                   403.6                5818                9.5%             26.5%      1.2%           NaN
1   1981                   478.8                6745                8.5%             27.9%      0.9%           NaN
2   1982                   505.9                6968               -0.5%             59.2%      4.2%           NaN
3   1983                   507.6                6841               -3.5%            101.8%      6.1%           NaN
4   1984                   543.5                7172                3.4%              5.6%       NaN           NaN
5   1985                   573.2                7409                2.2%             57.8%      4.4%           NaN
6   1986                   566.7                7180               -3.1%             86.4%      4.3%           NaN
7   1987                   591.2                7344                1.7%            132.0%      3.9%           NaN
8   1988                   619.8                7550                1.3%            113.5%      3.5%           NaN
9   1989                   670.3                8008                4.1%             19.5%      2.9%           NaN
10  1990                   731.1                8397                5.2%             26.7%      2.7%           NaN
11  1991                   787.2                8882                4.2%             22.6%      2.7%           NaN
12  1992                   833.7                9250                3.5%             15.5%      2.8%           NaN
13  1993                   876.9                9573                2.7%              9.8%      3.4%           NaN
14  1994                   939.8               10099                4.9%              7.0%      3.7%           NaN
15  1995                   899.0                9515               −6.3%             35.1%      6.2%           NaN
16  1996                   977.5               10195                6.8%             34.4%      5.5%         44.7%
17  1997                  1062.2               10928                6.8%             20.6%      3.7%         40.9%
18  1998                  1129.2               11466                5.2%             15.9%      3.2%         42.0%
19  1999                  1178.0               11815                2.8%             16.6%      2.5%         44.7%
20  2000                  1264.4               12532                4.9%              9.5%      2.2%         40.6%
21  2001                  1288.0               12612               −0.4%              6.4%      2.8%         39.4%
22  2002                  1307.2               12641                0.0%              5.0%      3.0%         41.7%
23  2003                  1352.6               12916                1.4%              4.5%      3.4%         40.2%
24  2004                  1444.3               13631                3.9%              4.7%      3.4%         40.2%
25  2005                  1525.1               14234                2.3%              4.0%      3.5%         38.5%
26  2006                  1642.7               15153                4.5%              3.6%      3.5%         37.4%
27  2007                  1725.1               15713                2.3%             4.0 %      3.6%         37.2%
28  2008                  1779.0               15984                1.1%              5.1%      3.9%         42.5%
29  2009                  1697.7               15044               −5.3%              5.3%      5.3%         43.7%
30  2010                  1806.5               15811                5.1%              4.2%      5.3%         42.0%
31  2011                  1911.3               16522                3.7%              3.4%      5.2%         42.9%
32  2012                  2017.5               17235                3.6%              4.1%      4.9%         45.9%
33  2013                  2077.8               17549                1.4%              3.8%      4.9%         45.9%
34  2014                  2175.2               18170                2.8%              3.8%      4.8%         48.9%
35  2015                  2270.7               18765                3.3%              2.7%      4.4%         52.9%
36  2016                  2366.7               19356                2.9%              2.8%      3.9%         56.8%
37  2017                  2458.4               19927                2.1%              6.0%      3.4%         54.1%
38  2018                  2571.6               20616                2.0%              4.9%      3.3%         53.6%
39  2019                  2658.0               20868               -0.1%              3.6%      3.4%         53.3%
40  2020                  2440.0               19130               -8.2%              3.1%      4.4%         60.6%
41  2021                  2613.7               20270                5.8%              3.5%      3.6%         60.5%
42  2022                  3060.0               23550                3.9%              7.8%      3.3%         54.1%
© www.soinside.com 2019 - 2024. All rights reserved.