谁能帮我弄一下我的选择器。我想只从一个论坛网站获取用户名和帖子内容。当我使用 Content = response.xpath('//html//p//text()').getall()
我能得到网站上的所有文本(因此我知道页面上确实有论坛帖子);但当我试图让选择器更具体、改用 Content = response.xpath('//html//div[contains(id="post_message"//text()').getall() 的时候
我得到错误信息和一个空文件。
我唯一能想到的原因是我的选择器写错了,或者我需要先登录。我完全不了解如何让网页爬虫处理登录。下面是我的代码,以及我尝试过的一些选择器例子。
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy import Selector
from scrapy.linkextractors import LinkExtractor
# Seed URL(s) for the crawl; consumed as `start_urls` by the spider below.
Websites = ["https://www.fodors.com/community/trending.php"]
class spider(scrapy.Spider):
    """Crawl each start page, follow every link found there, and scrape
    the poster's username plus the post body from each followed page.

    Yields one ``{username: [text, ...]}`` item per content page.
    """

    name = "spider"
    start_urls = Websites
    user_agent = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"
    )
    # NOTE: class-level mutable state is shared by every instance of this
    # spider; kept for compatibility with the original design.
    D = {}      # start URL -> list of hrefs scraped from it
    links = []  # all href lists collected so far

    def start_requests(self):
        """Seed one request per start URL; links are gathered in ParseLinks."""
        for url in self.start_urls:
            self.D[url] = []
            yield scrapy.Request(url, callback=self.ParseLinks)

    def ParseLinks(self, response):
        """Collect every href on the page and follow each one."""
        hrefs = response.xpath(".//a//@href").extract()
        self.links.append(hrefs)
        for url in self.D:
            self.D[url] = hrefs
        # FIX: follow only the links found on THIS page. The original looped
        # over the accumulated self.links, re-yielding every previously seen
        # link on each callback (Scrapy dedups them, but it is wasted work).
        for link in hrefs:
            yield response.follow(link, callback=self.ParseContent)

    def ParseContent(self, response):
        """Extract the poster's username and the post body text.

        FIX: XPath ``contains()`` takes two comma-separated arguments, and an
        attribute must be prefixed with ``@``. The original
        ``contains(@class="bigusername")`` / ``contains(id="post_message"``
        forms are invalid XPath (the latter also had an unclosed paren),
        which is why Scrapy raised an error and the output file was empty.
        """
        User = response.xpath(
            '//div//a[contains(@class, "bigusername")]/text()'
        ).get()
        Content = response.xpath(
            '//div[contains(@id, "post_message")]//text()'
        ).getall()
        # FIX: `x = yield item` always binds None here (Scrapy never sends a
        # value back into the generator), so the old print was misleading.
        yield {User: Content}
# Run the spider in-process. The __main__ guard keeps an import of this
# module from kicking off a live crawl as a side effect.
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(spider)
    process.start()  # blocks until the crawl finishes
你的 XPath 语法有误。改用下面这个:
Content = response.xpath('//html//div[contains(id,"post_message")]//text()').getall()
另外,如果 id 是一个属性,还需要在它前面加上 @(即 @id)。
如果对别人有帮助,这里是我最后的xpath。
# Final working selectors: first scope to each post container, then pull
# the author link text and the bolded post body.
XS = response.xpath('//html//div[contains(@id, "post_message")]')
# NOTE(review): a leading '//' inside XS.xpath(...) still searches the whole
# document, not just XS; use './/a[...]' to stay within the container — confirm.
User = XS.xpath('//a[contains(@href, "community/profile")]//text()').getall()
Content = XS.xpath('//b//text()').getall()