使用feedparser检测重复标题

问题描述 投票:0回答:1
def parseRSS(rss_url):
    parsed_feed = feedparser.parse(rss_url)
    return parsed_feed

def getHeadlines(rss_url,key):
    headlines = []
    feed = parseRSS(rss_url)
    for newsitem in feed['items']:
        if newsitem['title'] not in headlines:
            headlines.append([newsitem,key])
        else:
            print("-----------------------Duplicate title found----------------------")
    return headlines

def get_rss():
    allheadlines = []
    newsurls = {
    ('key1','source1'): 'https://news.google.com/news/rss/?hl=en&ned=us&gl=US',
    ('key2','source2'): 'https://news.google.com/news/rss/?hl=en&ned=us&gl=US',
    }
    for key,url in newsurls.items():
        allheadlines.extend(getHeadlines(url,key))

    return allheadlines

allheadlines = get_rss()

for hl in allheadlines:
    source = hl[1][0]
    key = hl[1][1]
    title = hl[0]['title']
    link = hl[0]['link']

我使用相同的rss feed进行测试。每次添加新标题时,我都会检查标题是否已经在标题中。但是,它似乎没有检测到重复的标题。找到的重复标题永远不会被打印。我究竟做错了什么?

python feedparser
1个回答
1
投票
尝试一下

def parseRSS(rss_url): parsed_feed = feedparser.parse(rss_url) return parsed_feed def getHeadlines(rss_url,key,allheadlines,allitems): feed = parseRSS(rss_url) for newsitem in feed['items']: if newsitem['title'] not in allheadlines: allheadlines.append(newsitem['title']) allitems.append([newsitem,key]) else: print("-----------------------Duplicate title found----------------------") return allheadlines,allitems def get_rss(): allheadlines = [] allitems = [] newsurls = { ('key1','source1'): 'https://news.google.com/news/rss/?hl=en&ned=us&gl=US', ('key2','source2'): 'https://news.google.com/news/rss/?hl=en&ned=us&gl=US', } for key,url in newsurls.items(): allheadlines,allitems=(getHeadlines(url,key,allheadlines,allitems)) return allitems allheadlines = get_rss()

© www.soinside.com 2019 - 2024. All rights reserved.