AttributeError: 'str' object has no attribute 'findAll'——使用 BeautifulSoup 从 YouTube 抓取数据时没有输出

问题描述 投票:2回答:1

我正在尝试从 YouTube 抓取热门视频,但无法获取 YouTube 页面中的 href 链接。我的代码和预期输出如下。如果改用 url = 'https://www.youtube.com/watch?v=tL8AOS9ZRMg',并注释掉 for link in youtubelinks: 这一行、同时修正缩进,就可以提取到数据。代码导入了 BeautifulSoup、urllib、ssl 和 os。我想把结果转换为 JSON 格式并保存。

我的代码在下面

import urllib.request
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup
import ssl
import json
import ast
import json
import os
from urllib.request import Request, urlopen

# Script from the question: fetch the YouTube trending page, collect up to
# three "/watch?v=" links, then try to extract per-video details and dump
# them to data.json.
#
# NOTE(review): as written this fails with
#   AttributeError: 'str' object has no attribute 'findAll'
# because the loop below calls .findAll() on plain URL strings instead of on
# a parsed page (see the comment at the start of the loop).

# For ignoring SSL certificate errors
# NOTE(review): ctx is configured here but never passed to urlopen() below,
# so it has no effect on the request made in this block.

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Input from user

#url = input('Enter Youtube Video Url- ')
#url = 'https://www.youtube.com/watch?v=MxnkDj8PIxQ'
url = 'https://www.youtube.com/feed/trending'
# Making the website believe that you are accessing it using a mozilla browser

req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()

# Creating a BeautifulSoup object of the html page for easy extraction of data.

soup = BeautifulSoup(webpage, 'html.parser')
# Pretty-printed copy of the trending page; it is written out in binary mode
# to output_file.html further down.
html = soup.prettify('utf-8')
video_details = {}
# other_details is never read or written again in this block.
other_details = {}

#All the trending youtube links
# Only the first three matching anchors are kept; set() removes duplicate
# URLs, so the resulting list may be shorter than three and is unordered.
a = soup.select('a[href^="/watch?v="]')[:3]
youtubelinks = list(set("https://www.youtube.com" + do.get('href', None) for do in a))
# Bare expression: has no effect when run as a script.
youtubelinks
#print (doc)
# Each `link` is a str built by the concatenation above, not a BeautifulSoup
# object, so the first link.findAll(...) call raises the AttributeError from
# the question title. The page behind each URL is never downloaded or parsed.
for link in youtubelinks:
    for span in link.findAll('span',attrs={'class': 'watch-title'}):
        #video_details['TITLE'] = span.text.strip()
        video_details['TITLE'] = span.text.encode('utf8')

    # The channel name is read from the page's JSON-LD <script> payload.
    for script in link.findAll('script',attrs={'type': 'application/ld+json'}):
            channelDesctiption = json.loads(script.text.strip())
            video_details['CHANNEL_NAME'] = channelDesctiption['itemListElement'][0]['item']['name']

    for div in link.findAll('div',attrs={'class': 'watch-view-count'}):
        video_details['NUMBER_OF_VIEWS'] = div.text.strip()

    for button in link.findAll('button',attrs={'title': 'I like this'}):
        video_details['LIKES'] = button.text.strip()

    for button in link.findAll('button',attrs={'title': 'I dislike this'}):
        video_details['DISLIKES'] = button.text.strip()

    for span in link.findAll('span',attrs={'class': 'yt-subscription-button-subscriber-count-branded-horizontal yt-subscriber-count'}):
        video_details['NUMBER_OF_SUBSCRIPTIONS'] = span.text.strip()

    hashtags = []
    # NOTE(review): the inner loop searches `link`, not the matched `span`,
    # and it rebinds the module-level name `a` used for the anchor list above.
    for span in link.findAll('span',attrs={'class': 'standalone-collection-badge-renderer-text'}):
        for a in link.findAll('a',attrs={'class': 'yt-uix-sessionlink'}):
            hashtags.append(a.text.strip())
    video_details['HASH_TAGS'] = hashtags

    # Both output files are reopened in write mode on every iteration, so each
    # pass overwrites the previous one; video_details is a single shared dict.
    with open('output_file.html', 'wb') as file:
        file.write(html)

    with open('data.json', 'w', encoding='utf8') as outfile:
        json.dump(video_details, outfile, ensure_ascii=False,indent=4)

    print ('----------Extraction of data is complete. Check json file.----------')

我的预期输出

{ 1. {
    "TITLE": "A",
    "CHANNEL_NAME": "B",
    "NUMBER_OF_VIEWS": "8,945 views",
    "LIKES": "71",
    "DISLIKES": "6",
    "NUMBER_OF_SUBSCRIPTIONS": "13.3K",
    "HASH_TAGS": [
        "#A",
        "#B",
        "#C"
    ]
}

2.{
    "TITLE": "D",
    "CHANNEL_NAME": "E",
    "NUMBER_OF_VIEWS": "8,945 views",
    "LIKES": "71K",
    "DISLIKES": "6K",
    "NUMBER_OF_SUBSCRIPTIONS": "1.3M",
    "HASH_TAGS": [
        "#M",
        "#F",
        "#G"
    ]
}
}
python xpath beautifulsoup request urllib
1个回答
2
投票
from bs4 import BeautifulSoup
import ssl
import json
import ast
import json
import os
from urllib.request import Request, urlopen

# For ignoring SSL certificate errors
# NOTE(review): ctx is configured but never passed to urlopen() in this
# script, so it does not influence the requests made by get_soup().
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE


def get_soup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup object.

    The request carries a browser-like User-Agent header. Any exception
    raised by urlopen() propagates to the caller.
    """
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, 'html.parser')
    return soup


url = 'https://www.youtube.com/feed/trending'
soup = get_soup(url)
# Pretty-printed copy of the trending page, written to output_file.html below.
html = soup.prettify('utf-8')
video_details = {}
other_details = {}

# All the trending youtube links (only the first match is kept).
youtubelinks = []
for a in soup.select('a[href^="/watch?v="]')[:1]:
    youtubelinks.append("https://www.youtube.com" + a['href'])

for video_url in youtubelinks:
    # The fix for the AttributeError in the question: fetch and parse each
    # video page, so findAll() runs on a BeautifulSoup object and not a str.
    link = get_soup(video_url)

    for span in link.findAll('span', attrs={'class': 'watch-title'}):
        video_details['TITLE'] = span.text.strip()

    # The channel name is read from the page's JSON-LD <script> payload.
    for script in link.findAll('script', attrs={'type': 'application/ld+json'}):
        channelDesctiption = json.loads(script.text.strip())
        video_details['CHANNEL_NAME'] = channelDesctiption['itemListElement'][0]['item']['name']

    for div in link.findAll('div', attrs={'class': 'watch-view-count'}):
        video_details['NUMBER_OF_VIEWS'] = div.text.strip()

    for button in link.findAll('button', attrs={'title': 'I like this'}):
        video_details['LIKES'] = button.text.strip()

    for button in link.findAll('button', attrs={'title': 'I dislike this'}):
        video_details['DISLIKES'] = button.text.strip()

    for span in link.findAll('span', attrs={'class': 'yt-subscription-button-subscriber-count-branded-horizontal yt-subscriber-count'}):
        video_details['NUMBER_OF_SUBSCRIPTIONS'] = span.text.strip()

    hashtags = []
    # NOTE(review): the inner loop searches the whole page (`link`) rather
    # than the matched `span`, so every 'yt-uix-sessionlink' anchor on the
    # page is collected. Kept as in the original answer so the output shown
    # below it still matches.
    for span in link.findAll('span', attrs={'class': 'standalone-collection-badge-renderer-text'}):
        for a in link.findAll('a', attrs={'class': 'yt-uix-sessionlink'}):
            hashtags.append(a.text.strip())
    video_details['HASH_TAGS'] = hashtags
    print(video_details)

    # Both files are reopened in write mode on each pass; with a single link
    # in youtubelinks they are written exactly once.
    with open('output_file.html', 'wb') as file:
        file.write(html)

    with open('data.json', 'w', encoding='utf8') as outfile:
        json.dump(video_details, outfile, ensure_ascii=False, indent=4)

    print ('----------Extraction of data is complete. Check json file.----------')

输出:

{'LIKES': '11,114', 'CHANNEL_NAME': 'World Rugby', 'DISLIKES': '293', 'NUMBER_OF_SUBSCRIPTIONS': '614K', 'NUMBER_OF_VIEWS': '634,395 views', 'TITLE': 'HIGHLIGHTS: Japan v Ireland - Rugby World Cup 2019', 'HASH_TAGS': ['GB', '', 'Review', '#1 on Trending', '', 'World Rugby', 'Sign in', 'Sign in', 'Sign in', 'Sign in', 'https://youtube.com/user/worldrugby', 'https://youtube.com/user/worldrugby', 'http://www.rugbyworldcup.com', 'https://twitter.com/rugbyworldcup', 'https://www.facebook.com/rugbyworldcup', 'http://www.instagram.com/rugbyworldcup', 'http://giphy.com/worldrugby', 'https://www.tiktok.com/@rugbyworldcup...', 'https://www.snapchat.com/add/rugbywor...', 'Sports', 'Extended Highlights: New Zealand v South Africa\n \n\n - Duration: 8:51.\n \nWorld Rugby\n869,064 viewsNew', '8:51', "Schmidt and Best's post match press conference| Japan v Ireland\n \n\n - Duration: 12:00.\n \nWorld Rugby\n48,365 viewsNew", '12:00', 'Liverpool players react to their FIFA 20 ratings | Van Dijk with Salah, Mane, Firmino and more\n \n\n - Duration: 5:52.\n \nLiverpool FC\n2,178,177 viewsNew', '5:52', "35th America's Cup Race 7 NZL vs. USA | AMERICA'S CUP\n \n\n - Duration: 23:23.\n \nAmerica's Cup\n152,003 views", '23:23', "Guy's maiden voyage on his hydrofoil boat | Guy Martin Proper\n \n\n - Duration: 7:09.\n \nGuy Martin Proper\n66,941 viewsNew", '7:09', "Furious Boris Johnson humiliates Jeremy Corbyn, rages at Labour's Brexit LIES and gets long APPLAUSE\n \n\n - Duration: 7:32.\n \nProductiehuisEU\n394,890 viewsNew", '7:32', "KOREA vs. 
BRAZIL - Highlights | Women's Volleyball World Cup 2019\n \n\n - Duration: 8:49.\n \nVolleyball World\n145,837 viewsNew", '8:49', "Jonah Lomu's 15 unforgettable Rugby World Cup tries\n \n\n - Duration: 6:00.\n \nWorld Rugby\n995,979 views", '6:00', 'Extended Highlights: France v Argentina\n \n\n - Duration: 8:35.\n \nWorld Rugby\n347,394 viewsNew', '8:35', 'What Martin Johnson did just before the 2003 World Cup final || Rugby World Cup Memories - Neil Back\n \n\n - Duration: 8:58.\n \nRugbyPass Official\n95,379 views', '8:58', "Ireland's Shock reaction to Japan Loss\n \n\n - Duration: 12:04.\n \nRugbyPass Official\n6,045 viewsNew", '12:04', 'Bodybuilder Tries Rugby, Gets SMASHED\n \n\n - Duration: 15:17.\n \nJuji & Tom\n2,138,650 views', '15:17', 'EXTENDED HIGHLIGHTS | Matchday One: Japan vs Russia\n \n\n - Duration: 23:38.\n \nWorld Rugby\n338,672 viewsNew', '23:38', 'My Story: Ruaridh McConnochie\n \n\n - Duration: 7:24.\n \nEngland Rugby\n20,312 viewsNew', '7:24', 'Japan head coach speaks after historic victory over Ireland\n \n\n - Duration: 1:21.\n \nWorld Rugby\n52,472 viewsNew', '1:21', 'HIGHLIGHTS: Argentina v Tonga - Rugby World Cup 2019\n \n\n - Duration: 2:56.\n \nWorld Rugby\n195,221 viewsNew', '2:56', 'Extended Highlights: Russia v Samoa - Rugby World Cup 2019\n \n\n - Duration: 23:11.\n \nWorld Rugby\n222,043 viewsNew', '23:11', 'Argentina vs Tonga (28-12) | Rugby World Cup 2019 Highlights\n \n\n - Duration: 3:16.\n \nITV\n16,274 viewsNew', '3:16', "Guy competes with the British America's Cup team | Guy Martin Proper\n \n\n - Duration: 9:29.\n \nGuy Martin Proper\n40,810 viewsNew", '9:29', 'Irish Rugby TV: Ireland v New Zealand 2018 GUINNESS Series Highlights\n \n\n - Duration: 7:13.\n \nIrish Rugby TV\n777,015 views', '7:13', '', 'History']} ----------Extraction of data is complete. Check json file.----------

© www.soinside.com 2019 - 2024. All rights reserved.