如何使用Python从网站下载/使用多个CSV文件

问题描述 投票:0回答:1

我一直在尝试从此网站下载日前市场 ExPost LMPs csv 文件:

https://www.misoenergy.org/markets-and-operations/real-time--market-data/market-reports#nt=/MarketReportType:Historical%20LMP/MarketReportName:Day-Ahead%20Market%20ExPost%20LMPs%20(csv)

我需要这些文件中各行各列的数据,以便稍后在我的项目中使用。因此,我先编写了下面这段代码。但是,运行时出现了一些错误消息,而我仍在学习 Python,不确定该如何解决。我也在尽量避免使用 ChatGPT,以便更有效地学习,这就是我在这里提问的原因。如能得到任何帮助,我将不胜感激;谢谢。

这是我在 JupyterLab 中编写的代码片段:

import urllib3
import pandas as pd
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
url = 'https://www.misoenergy.org/markets-and-operations/real-time--market-data/market-reports#nt=/MarketReportType:Historical%20LMP/MarketReportName:Day-Ahead%20Market%20ExPost%20LMPs%20(csv)'
df = pd.read_csv('https://www.misoenergy.org/markets-and-operations/real-time--market-data/market-reports#nt=/MarketReportType:Historical%20LMP/MarketReportName:Day-Ahead%20Market%20ExPost%20LMPs%20(csv)', skiprows = 5)
df.head() 

以下是我收到的错误消息:

---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
File /lib/python311.zip/urllib/request.py:1348, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args)
   1347 try:
-> 1348     h.request(req.get_method(), req.selector, req.data, headers,
   1349               encode_chunked=req.has_header('Transfer-encoding'))
   1350 except OSError as err: # timeout error

File /lib/python311.zip/http/client.py:1283, in HTTPConnection.request(self, method, url, body, headers, encode_chunked)
   1282 """Send a complete request to the server."""
-> 1283 self._send_request(method, url, body, headers, encode_chunked)

File /lib/python311.zip/http/client.py:1329, in HTTPConnection._send_request(self, method, url, body, headers, encode_chunked)
   1328     body = _encode(body, 'body')
-> 1329 self.endheaders(body, encode_chunked=encode_chunked)

File /lib/python311.zip/http/client.py:1278, in HTTPConnection.endheaders(self, message_body, encode_chunked)
   1277     raise CannotSendHeader()
-> 1278 self._send_output(message_body, encode_chunked=encode_chunked)

File /lib/python311.zip/http/client.py:1038, in HTTPConnection._send_output(self, message_body, encode_chunked)
   1037 del self._buffer[:]
-> 1038 self.send(msg)
   1040 if message_body is not None:
   1041 
   1042     # create a consistent interface to message_body

File /lib/python311.zip/http/client.py:976, in HTTPConnection.send(self, data)
    975 if self.auto_open:
--> 976     self.connect()
    977 else:

File /lib/python311.zip/http/client.py:1448, in HTTPSConnection.connect(self)
   1446 "Connect to a host on a given (SSL) port."
-> 1448 super().connect()
   1450 if self._tunnel_host:

File /lib/python311.zip/http/client.py:942, in HTTPConnection.connect(self)
    941 sys.audit("http.client.connect", self, self.host, self.port)
--> 942 self.sock = self._create_connection(
    943     (self.host,self.port), self.timeout, self.source_address)
    944 # Might fail in OSs that don't implement TCP_NODELAY

File /lib/python311.zip/socket.py:851, in create_connection(address, timeout, source_address, all_errors)
    850 if not all_errors:
--> 851     raise exceptions[0]
    852 raise ExceptionGroup("create_connection failed", exceptions)

File /lib/python311.zip/socket.py:836, in create_connection(address, timeout, source_address, all_errors)
    835     sock.bind(source_address)
--> 836 sock.connect(sa)
    837 # Break explicitly a reference cycle

OSError: [Errno 23] Host is unreachable

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
Cell In[26], line 5
      3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
      4 url = 'https://www.misoenergy.org/markets-and-operations/real-time--market-data/market-reports#nt=/MarketReportType:Historical%20LMP/MarketReportName:Day-Ahead%20Market%20ExPost%20LMPs%20(csv)'
----> 5 df = pd.read_csv('https://www.misoenergy.org/markets-and-operations/real-time--market-data/market-reports#nt=/MarketReportType:Historical%20LMP/MarketReportName:Day-Ahead%20Market%20ExPost%20LMPs%20(csv)', skiprows = 5)
      6 df.head() 

File /lib/python3.11/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
    209     else:
    210         kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)

File /lib/python3.11/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    325 if len(args) > num_allow_args:
    326     warnings.warn(
    327         msg.format(arguments=_format_argument_list(allow_args)),
    328         FutureWarning,
    329         stacklevel=find_stack_level(),
    330     )
--> 331 return func(*args, **kwargs)

File /lib/python3.11/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    946     defaults={"delimiter": ","},
    947 )
    948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)

File /lib/python3.11/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
    602 _validate_names(kwds.get("names", None))
    604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
    607 if chunksize or iterator:
    608     return parser

File /lib/python3.11/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
   1439     self.options["has_index_names"] = kwds["has_index_names"]
   1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)

File /lib/python3.11/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
   1733     if "b" not in mode:
   1734         mode += "b"
-> 1735 self.handles = get_handle(
   1736     f,
   1737     mode,
   1738     encoding=self.options.get("encoding", None),
   1739     compression=self.options.get("compression", None),
   1740     memory_map=self.options.get("memory_map", False),
   1741     is_text=is_text,
   1742     errors=self.options.get("encoding_errors", "strict"),
   1743     storage_options=self.options.get("storage_options", None),
   1744 )
   1745 assert self.handles is not None
   1746 f = self.handles.handle

File /lib/python3.11/site-packages/pandas/io/common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    710     codecs.lookup_error(errors)
    712 # open URLs
--> 713 ioargs = _get_filepath_or_buffer(
    714     path_or_buf,
    715     encoding=encoding,
    716     compression=compression,
    717     mode=mode,
    718     storage_options=storage_options,
    719 )
    721 handle = ioargs.filepath_or_buffer
    722 handles: list[BaseBuffer]

File /lib/python3.11/site-packages/pandas/io/common.py:363, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    361 # assuming storage_options is to be interpreted as headers
    362 req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
--> 363 with urlopen(req_info) as req:
    364     content_encoding = req.headers.get("Content-Encoding", None)
    365     if content_encoding == "gzip":
    366         # Override compression based on Content-Encoding header

File /lib/python3.11/site-packages/pandas/io/common.py:265, in urlopen(*args, **kwargs)
    259 """
    260 Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
    261 the stdlib.
    262 """
    263 import urllib.request
--> 265 return urllib.request.urlopen(*args, **kwargs)

File /lib/python311.zip/urllib/request.py:216, in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    214 else:
    215     opener = _opener
--> 216 return opener.open(url, data, timeout)

File /lib/python311.zip/urllib/request.py:519, in OpenerDirector.open(self, fullurl, data, timeout)
    516     req = meth(req)
    518 sys.audit('urllib.Request', req.full_url, req.data, req.headers, req.get_method())
--> 519 response = self._open(req, data)
    521 # post-process response
    522 meth_name = protocol+"_response"

File /lib/python311.zip/urllib/request.py:536, in OpenerDirector._open(self, req, data)
    533     return result
    535 protocol = req.type
--> 536 result = self._call_chain(self.handle_open, protocol, protocol +
    537                           '_open', req)
    538 if result:
    539     return result

File /lib/python311.zip/urllib/request.py:496, in OpenerDirector._call_chain(self, chain, kind, meth_name, *args)
    494 for handler in handlers:
    495     func = getattr(handler, meth_name)
--> 496     result = func(*args)
    497     if result is not None:
    498         return result

File /lib/python311.zip/urllib/request.py:1391, in HTTPSHandler.https_open(self, req)
   1390 def https_open(self, req):
-> 1391     return self.do_open(http.client.HTTPSConnection, req,
   1392         context=self._context, check_hostname=self._check_hostname)

File /lib/python311.zip/urllib/request.py:1351, in AbstractHTTPHandler.do_open(self, http_class, req, **http_conn_args)
   1348         h.request(req.get_method(), req.selector, req.data, headers,
   1349                   encode_chunked=req.has_header('Transfer-encoding'))
   1350     except OSError as err: # timeout error
-> 1351         raise URLError(err)
   1352     r = h.getresponse()
   1353 except:

URLError: <urlopen error [Errno 23] Host is unreachable>

尽管尝试了各种修复方法,但我仍未找到解决方案。我也参考了本网站上的许多其他解答,但它们似乎都不适用于我的情况。如能得到任何提示或帮助,我将不胜感激。

python pandas csv
1个回答
0
投票

这里有两件事:

  1. url 变量定义后未被使用(调用 read_csv 时又把同一个字符串重复写了一遍)
  2. 第 5 行传给 pd.read_csv 的字符串参数本应是某个 csv 文件的 url,但它实际指向的是一个 html 页面;该页面上列出了多个 csv 文件的链接,其本身并不是 csv 文件。

因此,考虑到以上两点,下面的代码应该可以正常工作。请注意,这里的 url 是我随意选取的:它是您最初使用的那个页面上列出的第一个 csv 文件的直接链接:

import urllib3
import pandas as pd
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
df = pd.read_csv('https://docs.misoenergy.org/marketreports/20240524_da_expost_lmp.csv', skiprows = 5)
df.head() 
© www.soinside.com 2019 - 2024. All rights reserved.