使用Python从URL下载m3u8

问题描述 投票:0回答:2

我刚开始用 Python 学习网络抓取。目前,我想下载日本国会(众议院)的视频。 (https://www.shugiintv.go.jp/jp/index.php?ex=VL&deli_id=40124&media_type=)

视频似乎有一种机制,可以从playlist.m3u8中调用chunklist.m3u8,然后按顺序调用chunklist.m3u8中描述的ts文件。

我想先从playlist.m3u8的URL下载其内容,然后根据其中引用的chunklist.m3u8按顺序下载ts文件,并把它们拼接起来。

但是,当我尝试下载 playlist.m3u8 时,返回的并不是我期望的文本。

另外,playlist.m3u8的示例URL在这里↓

http://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8

代码:

import requests

# Example playlist.m3u8 URL from the question.
url = "http://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8"
# Fetch the URL and print the response body decoded as text.
res = requests.get(url)
print(res.text)

预期文字:

#EXTM3U
#EXT-X-VERSION:3
#EXT-X-STREAM-INF:BANDWIDTH=564000,NAME="500k",RESOLUTION=640x360
chunklist_w60346572_b564000_t64NTAwaw==.m3u8

实际文字:

<html><head><title>Wowza Streaming Engine 4 Perpetual Bundle Unlimited Edition 4.7.7 build20181108145350</title></head><body>Wowza Streaming Engine 4 Perpetual Bundle Unlimited Edition 4.7.7 build20181108145350</body></html>

我认为URL中的冒号有问题,但我没有明确的解决方案。我想知道如何避免 URL 问题并成功下载 playlist.m3u8 中的文本。谢谢。

版本:

Python 3.7.9

requests 2.25.1

python web-scraping python-requests m3u8
2个回答
1
投票

您的网址有问题:

>>> url = "http://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8"
>>> res = requests.get(url)
>>> res.request.url
'https://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8%20'

看到最后的“%20”了吗?

我不太确定你是怎么弄错的,但是复制粘贴应该可以:

url = 'https://hlsvod.shugiintv.go.jp/vod/_definst_/amlst:2011/2011-1207-0900-12/playlist.m3u8'

0
投票

您可以尝试用下面的脚本下载 m3u8 文件、其中引用的加密密钥以及各个 ts 分片。


import sys
import os
import requests

OUTPUT_FOLDER = 'output'  # default output folder name


# Browser-like headers sent with every HTTP request made by this script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0)"
    " Gecko/20100101 Firefox/57.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    # "br" is deliberately not advertised: requests can only decode Brotli
    # when an optional brotli package is installed, and without it the
    # downloaded bytes would be written to disk still compressed.
    "Accept-Encoding": "gzip, deflate",
    "DNT": "1",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Cache-Control": "no-cache"
}


def downloadM3u8(m3u8url, headers=HEADERS, depth=0):
    """Recursively download an m3u8 playlist and the files it references.

    The playlist itself, every file referenced through a ``URI="..."``
    attribute (as returned by ``extractKeyUrls``) and every ``.ts`` segment
    are written below ``OUTPUT_FOLDER``. Nested playlists are then processed
    recursively. Segments whose file already exists locally are skipped.

    Args:
        m3u8url: URL of the playlist to download.
        headers: HTTP headers sent with every request made for this playlist
            and for its nested playlists.
        depth: Recursion depth, 0 for the top-level playlist. For nested
            playlists the third-from-last URL segment is added to the local
            output path.

    Returns:
        A flat list with the URLs of all nested playlists that were processed.
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urljoin

    def clean_refs(lines):
        # Drop blank lines and playlist tags/comments; strip CR/whitespace so
        # CRLF playlists do not leak '\r' into URLs and file names.
        stripped = (line.strip() for line in lines)
        return [line for line in stripped if line and not line.startswith('#')]

    def local_name(ref):
        # File name on disk: last path segment without the query string.
        return ref.split('?')[0].split('/')[-1]

    print('processing: {}'.format(m3u8url))
    # Honour the caller-supplied headers instead of the module default.
    m3u8 = requests.get(m3u8url, headers=headers)

    url_parts = m3u8url.split('/')
    folder = url_parts[-2]
    parent_folder = url_parts[-3] if depth > 0 else None
    filename = url_parts[-1].split('?')[0]
    path_parts = [
        part for part in (OUTPUT_FOLDER, parent_folder, folder)
        if part is not None
    ]
    target_dir = os.path.join(*path_parts)
    # makedirs creates missing intermediate directories; a plain mkdir fails
    # for nested playlists because OUTPUT_FOLDER/<parent_folder> is missing.
    os.makedirs(target_dir, exist_ok=True)

    target_path = os.path.join(target_dir, filename)
    with open(target_path, 'wb') as f:
        print('writing file to {}'.format(target_path))
        f.write(m3u8.content)

    # Download encrypted key files
    key_urls = extractKeyUrls(m3u8)
    print('key_urls', key_urls)
    for key_url in key_urls:
        # urljoin resolves relative, root-relative and absolute references.
        key_file = requests.get(urljoin(m3u8url, key_url), headers=headers)
        with open(os.path.join(target_dir, local_name(key_url)), 'wb') as f:
            f.write(key_file.content)

    ts_urls = clean_refs(extractTsUrls(m3u8))  # all the .ts references
    print('ts_urls', ts_urls)
    # Segments already present in the target directory are not fetched again.
    ts_files = {name for name in os.listdir(target_dir) if '.ts' in name}
    print('all ts files existing: {}'.format(ts_files))
    for ts in ts_urls:
        ts_filename = local_name(ts)
        if ts_filename in ts_files:
            continue
        ts_url = urljoin(m3u8url, ts)
        print('downloading: {}'.format(ts_url))
        ts_file = requests.get(ts_url, headers=headers)
        with open(os.path.join(target_dir, ts_filename), 'wb') as f:
            f.write(ts_file.content)

    child_urls = clean_refs(extractM3u8Urls(m3u8))  # nested playlists
    all_urls = []
    print('child_urls', child_urls)
    for child in child_urls:
        new_url = urljoin(m3u8url, child)
        all_urls.append(new_url)
        subchildren = downloadM3u8(new_url, headers=headers, depth=depth + 1)
        print('subchildren', subchildren)
        all_urls.extend(subchildren)
    return all_urls

def extractTsUrls(m3):
    """Return the ``.ts`` segment references listed in a playlist.

    Args:
        m3: Response-like object whose ``text`` attribute holds the playlist.

    Returns:
        The playlist lines containing ``.ts``, in file order, with surrounding
        whitespace removed. Tag/comment lines (starting with ``#``) are
        skipped because they are not segment URLs.
    """
    # splitlines() handles both LF and CRLF playlists; strip() removes any
    # stray whitespace that would otherwise end up in URLs and file names.
    stripped = (line.strip() for line in m3.text.splitlines())
    return [
        line for line in stripped
        if '.ts' in line and not line.startswith('#')
    ]

def extractM3u8Urls(m3):
    """Return the nested playlist references listed in a playlist.

    Args:
        m3: Response-like object whose ``text`` attribute holds the playlist.

    Returns:
        The playlist lines containing ``.m3u8``, in file order, with
        surrounding whitespace removed. Tag/comment lines (starting with
        ``#``) are skipped because they are not playlist URLs.
    """
    # splitlines() handles both LF and CRLF playlists; strip() removes any
    # stray whitespace that would otherwise end up in the child URLs.
    stripped = (line.strip() for line in m3.text.splitlines())
    return [
        line for line in stripped
        if '.m3u8' in line and not line.startswith('#')
    ]


def extractKeyUrls(m3):
    """Return the values of ``URI="..."`` attributes found in a playlist.

    Args:
        m3: Response-like object whose ``text`` attribute holds the playlist.

    Returns:
        A list with the first quoted ``URI`` value of every line that has
        one, in file order.
    """
    # Imported locally because the file's import block does not provide `re`;
    # without this the function raises NameError on its first call.
    import re

    uri_pattern = re.compile(r'URI="([^"]+)"')
    matches = (uri_pattern.search(line) for line in m3.text.split('\n'))
    return [match.group(1) for match in matches if match]


if __name__ == "__main__":
    # sys.argv[0] is the script path; the playlist URL is the first argument.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <m3u8 url>'.format(sys.argv[0]))
    downloadM3u8(sys.argv[1], headers=HEADERS)
    print('done')

© www.soinside.com 2019 - 2024. All rights reserved.