我正在学习 pandas 并尝试在 Jupyter 中获取维基百科页面的表格,但出现此错误:
UnicodeEncodeError:“ascii”编解码器无法对位置 16 中的字符“\xed”进行编码:序数不在范围(128)中
df_mx = pd.read_html('https://es.wikipedia.org/wiki/Economía_de_México', match='Indicadores macroeconómicos, financieros y de bienestar')
我得到以下错误:
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
Cell In[14], line 1
----> 1 df_mx = pd.read_html('https://es.wikipedia.org/wiki/Economía_de_México', match='Indicadores macroeconómicos, financieros y de bienestar')
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:1212, in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only, extract_links, dtype_backend)
1208 check_dtype_backend(dtype_backend)
1210 io = stringify_path(io)
-> 1212 return _parse(
1213 flavor=flavor,
1214 io=io,
1215 match=match,
1216 header=header,
1217 index_col=index_col,
1218 skiprows=skiprows,
1219 parse_dates=parse_dates,
1220 thousands=thousands,
1221 attrs=attrs,
1222 encoding=encoding,
1223 decimal=decimal,
1224 converters=converters,
1225 na_values=na_values,
1226 keep_default_na=keep_default_na,
1227 displayed_only=displayed_only,
1228 extract_links=extract_links,
1229 dtype_backend=dtype_backend,
1230 )
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:1001, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs)
999 else:
1000 assert retained is not None # for mypy
-> 1001 raise retained
1003 ret = []
1004 for table in tables:
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:981, in _parse(flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs)
978 p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)
980 try:
--> 981 tables = p.parse_tables()
982 except ValueError as caught:
983 # if `io` is an io-like object, check if it's seekable
984 # and try to rewind it before trying the next parser
985 if hasattr(io, "seekable") and io.seekable():
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:257, in _HtmlFrameParser.parse_tables(self)
249 def parse_tables(self):
250 """
251 Parse and return all tables from the DOM.
252
(...)
255 list of parsed (header, body, footer) tuples from tables.
256 """
--> 257 tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
258 return (self._parse_thead_tbody_tfoot(table) for table in tables)
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:666, in _BeautifulSoupHtml5LibFrameParser._build_doc(self)
663 def _build_doc(self):
664 from bs4 import BeautifulSoup
--> 666 bdoc = self._setup_build_doc()
667 if isinstance(bdoc, bytes) and self.encoding is not None:
668 udoc = bdoc.decode(self.encoding)
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:658, in _BeautifulSoupHtml5LibFrameParser._setup_build_doc(self)
657 def _setup_build_doc(self):
--> 658 raw_text = _read(self.io, self.encoding)
659 if not raw_text:
660 raise ValueError(f"No text parsed from document: {self.io}")
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/html.py:155, in _read(obj, encoding)
149 text: str | bytes
150 if (
151 is_url(obj)
152 or hasattr(obj, "read")
153 or (isinstance(obj, str) and file_exists(obj))
154 ):
--> 155 with get_handle(obj, "r", encoding=encoding) as handles:
156 text = handles.handle.read()
157 elif isinstance(obj, (str, bytes)):
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:716, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
713 codecs.lookup_error(errors)
715 # open URLs
--> 716 ioargs = _get_filepath_or_buffer(
717 path_or_buf,
718 encoding=encoding,
719 compression=compression,
720 mode=mode,
721 storage_options=storage_options,
722 )
724 handle = ioargs.filepath_or_buffer
725 handles: list[BaseBuffer]
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:368, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
366 # assuming storage_options is to be interpreted as headers
367 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 368 with urlopen(req_info) as req:
369 content_encoding = req.headers.get("Content-Encoding", None)
370 if content_encoding == "gzip":
371 # Override compression based on Content-Encoding header
File ~/anaconda3/lib/python3.11/site-packages/pandas/io/common.py:270, in urlopen(*args, **kwargs)
264 """
265 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
266 the stdlib.
267 """
268 import urllib.request
--> 270 return urllib.request.urlopen(*args, **kwargs)
File ~/anaconda3/lib/python3.11/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
214 else:
215 opener = _opener
--> 216 return opener.open(url, data, timeout)
File ~/anaconda3/lib/python3.11/urllib/request.py:519, in OpenerDirector.open(self, fullurl, data, timeout)
516 req = meth(req)
518 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 519 response = self._open(req, data)
521 # post-process response
522 meth_name = protocol+"_response"
File ~/anaconda3/lib/python3.11/urllib/request.py:536, in OpenerDirector._open(self, req, data)
533 return result
535 protocol = req.type
--> 536 result = self._call_chain(self.handle_open, protocol, protocol +
537 '_open', req)
538 if result:
539 return result
File ~/anaconda3/lib/python3.11/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
494 for handler in handlers:
495 func = getattr(handler, meth_name)
--> 496 result = func(*args)
497 if result is not None:
498 return result
File ~/anaconda3/lib/python3.11/urllib/request.py:1391, in HTTPSHandler.https_open(self, req)
1390 def https_open(self, req):
-> 1391 return self.do_open(http.client.HTTPSConnection, req,
1392 context=self._context, check_hostname=self._check_hostname)
File ~/anaconda3/lib/python3.11/urllib/request.py:1348, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args)
1346 try:
1347 try:
-> 1348 h.request(req.get_method(), req.selector, req.data, headers,
1349 encode_chunked=req.has_header('Transfer-encoding'))
1350 except OSError as err: # timeout error
1351 raise URLError(err)
File ~/anaconda3/lib/python3.11/http/client.py:1286, in HTTPConnection.request(self, method, url, body, headers, encode_chunked)
1283 def request(self, method, url, body=None, headers={}, *,
1284 encode_chunked=False):
1285 """Send a complete request to the server."""
-> 1286 self._send_request(method, url, body, headers, encode_chunked)
File ~/anaconda3/lib/python3.11/http/client.py:1297, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked)
1294 if 'accept-encoding' in header_names:
1295 skips['skip_accept_encoding'] = 1
-> 1297 self.putrequest(method, url, **skips)
1299 # chunked encoding will happen if HTTP/1.1 is used and either
1300 # the caller passes encode_chunked=True or the following
1301 # conditions hold:
1302 # 1. content-length has not been explicitly set
1303 # 2. the body is a file or iterable, but not a str or bytes-like
1304 # 3. Transfer-Encoding has NOT been explicitly set by the caller
1306 if 'content-length' not in header_names:
1307 # only chunk body if not explicitly set for backwards
1308 # compatibility, assuming the client code is already handling the
1309 # chunking
File ~/anaconda3/lib/python3.11/http/client.py:1135, in HTTPConnection.putrequest(self, method, url, skip_host, skip_accept_encoding)
1131 self._validate_path(url)
1133 request = '%s %s %s' % (method, url, self._http_vsn_str)
-> 1135 self._output(self._encode_request(request))
1137 if self._http_vsn == 11:
1138 # Issue some standard headers for better HTTP/1.1 compliance
1140 if not skip_host:
1141 # this header is issued *only* for HTTP/1.1
1142 # connections. more specifically, this means it is
(...)
1152 # but the host of the actual URL, not the host of the
1153 # proxy.
File ~/anaconda3/lib/python3.11/http/client.py:1215, in HTTPConnection._encode_request(self, request)
1213 def _encode_request(self, request):
1214 # ASCII also helps prevent CVE-2019-9740.
-> 1215 return request.encode('ascii')
UnicodeEncodeError: 'ascii' codec can't encode character '\xed' in position 16: ordinal not in range(128)
我尝试加上 .encode('UTF-8'),但不起作用。
我无法只用 pandas 直接读取该页面(会出现上述编码错误)。改用 requests 模块获取页面、再把文本内容传给 pandas 是可行的;注意 match 参数必须是表格本身中出现的文本,而不是维基百科的章节标题,并且需要指定标题行:
import io

import pandas as pd
import requests

# pandas builds the HTTP request itself and http.client rejects non-ASCII
# request lines, so fetch the page with requests (which handles the
# percent-encoded URL fine) and hand the HTML text to pandas instead.
response = requests.get(
    'https://es.wikipedia.org/wiki/Econom%C3%ADa_de_M%C3%A9xico',
    timeout=30,  # avoid hanging forever if Wikipedia is unreachable
)
response.raise_for_status()  # fail fast instead of parsing an HTTP error page

# match must be text that appears inside the table itself (a column header),
# not the Wikipedia section title; header=0 promotes the first row to columns.
matches = pd.read_html(io.StringIO(response.text), match='Crecimiento del PIB', header=0)

for table in matches:
    print(table)
输出:
Año PIB PPA (miles de mdd) PIB PPA per cápita Crecimiento del PIB Tasa de inflación Desempleo Deuda pública
0 1980 403.6 5818 9.5% 26.5% 1.2% NaN
1 1981 478.8 6745 8.5% 27.9% 0.9% NaN
2 1982 505.9 6968 -0.5% 59.2% 4.2% NaN
3 1983 507.6 6841 -3.5% 101.8% 6.1% NaN
4 1984 543.5 7172 3.4% 5.6% NaN NaN
5 1985 573.2 7409 2.2% 57.8% 4.4% NaN
6 1986 566.7 7180 -3.1% 86.4% 4.3% NaN
7 1987 591.2 7344 1.7% 132.0% 3.9% NaN
8 1988 619.8 7550 1.3% 113.5% 3.5% NaN
9 1989 670.3 8008 4.1% 19.5% 2.9% NaN
10 1990 731.1 8397 5.2% 26.7% 2.7% NaN
11 1991 787.2 8882 4.2% 22.6% 2.7% NaN
12 1992 833.7 9250 3.5% 15.5% 2.8% NaN
13 1993 876.9 9573 2.7% 9.8% 3.4% NaN
14 1994 939.8 10099 4.9% 7.0% 3.7% NaN
15 1995 899.0 9515 −6.3% 35.1% 6.2% NaN
16 1996 977.5 10195 6.8% 34.4% 5.5% 44.7%
17 1997 1062.2 10928 6.8% 20.6% 3.7% 40.9%
18 1998 1129.2 11466 5.2% 15.9% 3.2% 42.0%
19 1999 1178.0 11815 2.8% 16.6% 2.5% 44.7%
20 2000 1264.4 12532 4.9% 9.5% 2.2% 40.6%
21 2001 1288.0 12612 −0.4% 6.4% 2.8% 39.4%
22 2002 1307.2 12641 0.0% 5.0% 3.0% 41.7%
23 2003 1352.6 12916 1.4% 4.5% 3.4% 40.2%
24 2004 1444.3 13631 3.9% 4.7% 3.4% 40.2%
25 2005 1525.1 14234 2.3% 4.0% 3.5% 38.5%
26 2006 1642.7 15153 4.5% 3.6% 3.5% 37.4%
27 2007 1725.1 15713 2.3% 4.0 % 3.6% 37.2%
28 2008 1779.0 15984 1.1% 5.1% 3.9% 42.5%
29 2009 1697.7 15044 −5.3% 5.3% 5.3% 43.7%
30 2010 1806.5 15811 5.1% 4.2% 5.3% 42.0%
31 2011 1911.3 16522 3.7% 3.4% 5.2% 42.9%
32 2012 2017.5 17235 3.6% 4.1% 4.9% 45.9%
33 2013 2077.8 17549 1.4% 3.8% 4.9% 45.9%
34 2014 2175.2 18170 2.8% 3.8% 4.8% 48.9%
35 2015 2270.7 18765 3.3% 2.7% 4.4% 52.9%
36 2016 2366.7 19356 2.9% 2.8% 3.9% 56.8%
37 2017 2458.4 19927 2.1% 6.0% 3.4% 54.1%
38 2018 2571.6 20616 2.0% 4.9% 3.3% 53.6%
39 2019 2658.0 20868 -0.1% 3.6% 3.4% 53.3%
40 2020 2440.0 19130 -8.2% 3.1% 4.4% 60.6%
41 2021 2613.7 20270 5.8% 3.5% 3.6% 60.5%
42 2022 3060.0 23550 3.9% 7.8% 3.3% 54.1%