def start_requests(self):
    """Kick off the crawl: issue one request per seed feed URL.

    The original URL is carried along in ``meta`` so the ``parse``
    callback can compare it with the final (possibly redirected) URL.
    """
    seed_urls = (
        'https://www.irrawaddy.com/category/business/feed',
    )
    for seed in seed_urls:
        yield scrapy.Request(
            seed,
            headers=self.headers,
            callback=self.parse,
            meta={'original_url': seed},
        )
def parse(self, response):
    """Log the outcome of a feed request.

    Prints the HTTP status; on a 200 also prints the original URL
    (carried in ``response.meta``) next to the final URL so that any
    redirect is visible.
    """
    try:
        original_url = response.meta.get('original_url')
        final_url = response.url
        print(f"{response.status}")
        if response.status == 200:
            print(f"Original URL: {original_url}, Final URL: {final_url}")
    except Exception as e:
        # Fix: the pasted block had a bare `try:` with no handler, which is
        # a SyntaxError. Report the error (same style as the file's other
        # parse callback) instead of crashing the spider.
        print(f"Error: {str(e)}")
2024-03-27 14:31:29 [scrapy.core.engine] INFO: Spider opened
2024-03-27 14:31:29 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2024-03-27 14:31:29 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6030
2024-03-27 14:31:30 [scrapy.core.engine] DEBUG: Crawled (403) <GET https://www.irrawaddy.com/category/business/feed> (referer: None)
2024-03-27 14:31:30 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://www.irrawaddy.com/category/business/feed>: HTTP status code is not handled or not allowed
2024-03-27 14:31:30 [scrapy.core.engine] INFO: Closing spider (finished)
import scrapy
from Testspider.settings import WEBSHARE_URL


class TestSpider(scrapy.Spider):
    """Fetch an RSS feed through an optional proxy, ignoring robots.txt.

    Reconstructed from a machine-translated paste: Python keywords and
    several identifiers had been translated into Chinese, which is not
    valid Python.
    """

    # Fix: the spider name had been machine-translated; restored to an
    # ASCII identifier-style name.
    name = 'testspider'

    def __init__(self):
        # Fix: the paste had `init` — Python requires `__init__`.
        # Fix: "如 Gecko" inside the UA string was a machine translation
        # of "like Gecko"; servers expect the canonical token.
        self.user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36')
        self.is_proxy = 1       # 1 => route requests through the Webshare proxy
        self.is_robotTxt = 0    # 0 => set dont_obey_robotstxt on each request

    def start_requests(self):
        """Yield one Request per seed URL with proxy/robots meta options."""
        urls = [
            'https://www.jeuneafrique.com/pays/benin/feed/',
        ]
        meta_options = {}
        if self.is_proxy == 1:
            # 'proxy' is the Scrapy-reserved meta key (it had been
            # translated to 代理, which Scrapy does not recognize).
            meta_options['proxy'] = WEBSHARE_URL["WEBSHARE_URL_HTTP"]
        if self.is_robotTxt == 0:
            # Per-request opt-out honored by RobotsTxtMiddleware.
            meta_options['dont_obey_robotstxt'] = True
        for url in urls:
            yield scrapy.Request(
                url,
                meta=meta_options if meta_options else None,
                headers={"User-Agent": self.user_agent} if self.user_agent else None,
                callback=self.parse,
            )
def parse(self, response):
    """Extract title/pubDate/link from each RSS ``<item>`` or Atom ``<entry>``.

    Yields one dict per feed entry. The XPath alternations cover both RSS
    (``channel/item``, ``pubDate``) and Atom (``feed/entry``, ``updated``,
    ``link/@href``) layouts.
    """
    try:
        print(f"{response.status}")
        if response.status == 200:
            items = response.xpath('//channel/item | //feed/entry')
            for item in items:
                # Fix: the original reused ONE `feed` dict for every item —
                # each iteration clobbered the previous values and nothing
                # was ever yielded, so zero items were scraped even on a
                # successful fetch. Build and yield a fresh dict per entry.
                yield {
                    'title': item.xpath('title/text()').get(),
                    'pubDate': item.xpath('pubDate/text() | pubdate/text() | updated/text()').get(),
                    'link': item.xpath('link/text() | link/@href').get(),
                }
    except Exception as e:
        print(f"Error: {str(e)}")
def spider_closed(self, spider):
    """Shutdown hook: announce that the spider has finished."""
    closing_notice = "Spider is closed."
    print(closing_notice)
This is the complete code. I have applied a user agent, a proxy, and the robots.txt opt-out, but I am still getting this error:
2024-03-28 12:34:00 [scrapy.core.engine] INFO: Spider opened
2024-03-28 12:34:00 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2024-03-28 12:34:00 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6027
2024-03-28 12:34:01 [scrapy.core.engine] DEBUG: Crawled (403)