我的目标是从这个网站获取数据:https://pokemondb.net/pokedex/all
我正在尝试抓取每只宝可梦的特性(abilities),每条特性记录需要包含以下字段:
网址(URL)、名称、效果说明
但是其中一些信息在另一个页面中。关于如何获得它们有什么建议吗?
我需要访问每个特性(ability)的链接,并收集该页面中的信息:
我的代码现在看起来像这样:
import scrapy
class PokeSpider(scrapy.Spider):
    """Spider that scrapes a Pokémon detail page reached from the Pokédex table.

    ``parse`` reads the listing page in ``start_urls`` and follows the link
    found in the first table row; ``parser_pokemon`` turns the detail page
    into a single item (a plain dict).
    """

    name = 'pokespider'
    start_urls = ['https://pokemondb.net/pokedex/all']

    def parse(self, response):
        """Follow the detail-page link in the first row of ``table#pokedex``.

        Only ``tr:first-child`` is selected, so a single detail request is
        produced per listing page.
        """
        linha = response.css('table#pokedex > tbody > tr:first-child')
        href = linha.css("td:nth-child(2) > a::attr(href)").get()
        if href is None:
            # Without this guard ``response.follow`` would be handed None
            # and raise; report the layout problem and stop instead.
            self.logger.warning("No pokemon link found on %s", response.url)
            return
        yield response.follow(href, self.parser_pokemon)

    def parser_pokemon(self, response):
        """Extract the vitals and the evolution list from a detail page.

        Yields one dict with the keys ``nome``, ``id``, ``tamanho``,
        ``peso``, ``url_pokemon``, ``tipos`` and ``evolucoes``. Any value
        whose selector matches nothing is ``None`` (or an empty list for
        ``tipos`` / ``evolucoes``).
        """
        nome = response.css('h1::text').get()
        # Kept under a different local name so the ``id`` builtin is not
        # shadowed; the output key is still "id".
        numero = response.css('table.vitals-table > tbody > tr:nth-child(1) > td > strong::text').get()
        tamanho = response.css('table.vitals-table > tbody > tr:nth-child(4) > td::text').get()
        peso = response.css('table.vitals-table > tbody > tr:nth-child(5) > td::text').get()
        # At most two entries are kept from the matched links.
        tipos = response.css('table.vitals-table tbody tr:nth-child(2) td a::text').getall()[:2]

        evolucoes = []
        for evolucao in response.css('#main div.infocard-list-evo div span.infocard-lg-data.text-muted'):
            href = evolucao.css('a::attr(href)').get()
            evolucoes.append({
                "nome_evolucao": evolucao.css('a::text').get(),
                "id_evolucao": evolucao.css('small:nth-child(1)::text').get(),
                # Resolve the href against the page URL rather than gluing
                # it to a hard-coded host; a missing href stays None
                # instead of turning into "https://pokemondb.netNone".
                "url_evolucao": response.urljoin(href) if href is not None else None,
            })

        yield {
            "nome": nome,
            "id": numero,
            "tamanho": tamanho,
            "peso": peso,
            "url_pokemon": response.url,
            "tipos": tipos,
            "evolucoes": evolucoes,
        }
我建议您阅读有关 cb_kwargs 的文档 https://docs.scrapy.org/en/latest/topics/debug.html?highlight=cb_kwargs 以及 Scrapy 的 Item(数据项)文档 https://docs.scrapy.org/en/latest/topics/items.html
您可以发出下一个请求,并像下面这样通过 meta 参数把已收集的信息传递给下一个回调函数:
def parser_pokemon(self, response):
    """Scrape a Pokémon detail page, then request a follow-up page.

    The collected fields are not yielded as an item here. They are attached
    to the follow-up request under ``meta['pokemon_attribs']`` and the
    request is routed to ``self.parse_attributes``, which reads them back
    from ``response.meta``.
    """
    nome = response.css('h1::text').get()
    # Kept under a different local name so the ``id`` builtin is not
    # shadowed; the key passed along is still "id".
    numero = response.css('table.vitals-table > tbody > tr:nth-child(1) > td > strong::text').get()
    tamanho = response.css('table.vitals-table > tbody > tr:nth-child(4) > td::text').get()
    peso = response.css('table.vitals-table > tbody > tr:nth-child(5) > td::text').get()
    # At most two entries are kept from the matched links.
    tipos = response.css('table.vitals-table tbody tr:nth-child(2) td a::text').getall()[:2]

    evolucoes = []
    for evolucao in response.css('#main div.infocard-list-evo div span.infocard-lg-data.text-muted'):
        href = evolucao.css('a::attr(href)').get()
        evolucoes.append({
            "nome_evolucao": evolucao.css('a::text').get(),
            "id_evolucao": evolucao.css('small:nth-child(1)::text').get(),
            # Resolve the href against the page URL rather than gluing it
            # to a hard-coded host; a missing href stays None instead of
            # turning into "https://pokemondb.netNone".
            "url_evolucao": response.urljoin(href) if href is not None else None,
        })

    # ``Request`` is referenced through the ``scrapy`` module: a bare
    # ``Request`` name is not imported and would raise NameError.
    yield scrapy.Request(
        # NOTE(review): this URL looks like a placeholder — replace it with
        # the real follow-up page before running the spider.
        url='https://example.com/next_page_path',
        callback=self.parse_attributes,
        meta={
            'pokemon_attribs': {
                "nome": nome,
                "id": numero,
                "tamanho": tamanho,
                "peso": peso,
                "url_pokemon": response.url,
                "tipos": tipos,
                "evolucoes": evolucoes,
            },
        },
    )
def parse_attributes(self, response):
    """Finish the item started by the previous callback and yield it.

    Reads the dict stored under ``response.meta['pokemon_attribs']``, adds
    the text of the first ``a`` element on this page as
    ``pokemon_lastname``, and yields that same dict.
    """
    item = response.meta['pokemon_attribs']
    item['pokemon_lastname'] = response.css('a::text').get()
    yield item