这是我的蜘蛛文件。在提取文章之前,我想检查一下我提供的关键字之一是否与文章链接的meta关键字匹配。
因此,这段代码似乎在爬网,但是它绝对不会报废任何内容,因为它不会返回任何项目。
import urlparse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from projetpfe.items import ProjetpfeItem
class ProjetpfeSpider(CrawlSpider):
name = 'lavieecokw'
allowed_domains = ["lavieeco.com"]
start_urls = ['http://www.lavieeco.com/news/economie/barid-al-maghrib-un-dialogue-social-laborieux--32385.html'] # urls from which the spider will start crawling
rules = [Rule(SgmlLinkExtractor(allow=[r'news/\w+/\w+']), callback='parse_lavieeco', follow=True)]
# r'\d{4}/\d{2}/\w+' : regular expression for http://telquel.ma/YYYY/MM/title URLs
def parse_lavieeco(self, response):
items = []
keywords = ['barid al maghrib', 'barid al-maghrib', 'postale', 'timbre-poste', 'timbre' , 'al amana', 'poste maroc', 'barid bank', 'banque postale']
metakeywords = response.xpath('//meta[@name="keywords"]').extract()
if metakeywords != []:
if any(key in metakeywords for key in keywords):
for link in response.xpath("//a"):
item = ProjetpfeItem()
item['Titre'] = link.xpath('text()').extract()
item['LienImage'] = link.xpath('text()').extract()
item['OrganePresse'] = link.xpath('/html/head/title').extract()
item['Jour'] = link.xpath('text()').extract()
item['Contenu'] = link.xpath('text()').extract()
item['Journaliste'] = link.xpath('text()').extract()
item['URL'] = link.xpath('@href').extract()
item['Categorie'] = link.xpath('text()').extract()
item['Annee'] = link.xpath('text()').extract()
outbound = str(link.xpath('@href').extract())
if 'http' in outbound:
items.append(item)
return items
您可以尝试这样的事情,
title = response.xpath('//title/text()').extract()
title = title[0].strip().lower() if title else ''
metakeywords = [title for _key in keywords if _key in title]
可疑外壳中的演示,
In [1]: keywords = ['barid al maghrib', 'barid al-maghrib', 'postale', 'timbre-poste', 'timbre' , 'al amana', 'poste maroc', 'barid bank', 'banque postale']
In [2]: title = response.xpath('//title/text()').extract()
In [3]: title = title[0].strip().lower() if title else ''
In [4]: metakeywords = [title for _key in keywords if _key in title]
In [5]: metakeywords
Out[5]: [u'barid al maghrib : un dialogue social laborieux\u2026']
编辑
因此完整的代码将是
def parse_lavieeco(self, response):
keywords = ['barid al maghrib', 'barid al-maghrib', 'postale', 'timbre-poste', 'timbre' , 'al amana', 'poste maroc', 'barid bank', 'banque postale']
title = response.xpath('//title/text()').extract()
if title:
title = title[0].strip().lower()
metakeywords = [title for _key in keywords if _key in title]
if metakeywords:
# rest of your code if metakeywords contains title goes here
我正在努力抓取一个网站作为新闻标题。我只想在标题中写上特定的单词。我尝试了您在那里所做的事情,但是由于某种原因我无法获得任何结果,您知道为什么吗?
我将在下面发布我的代码:
def parse(self, response):
keywords = ['Coronavirus', 'Headphones']
headlines = response.css('.px-2 .post-url::text').extract()
if headlines:
headlines = headlines[0].strip().lower()
metakeywords = [headlines for _key in keywords if _key in headlines]
if metakeywords:
yield {'headlinestext' : headlines}