I want to scrape a Wikipedia page with Node.js

Problem description · Votes: 0 · Answers: 1

I am trying to scrape a Wikipedia page: https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue. My goal is to store all 50 rows of the table in a CSV file.

But I am unable to do so. Please see the attached code.

// Importing necessary modules

const request = require("request-promise")
const cheerio = require("cheerio")
const fs      = require("fs")
const json2csv = require("json2csv").Parser

// page which I want to scrape
const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";

(async () => {

    // this will store the data
    let data = [];

    // making request
    const response = await request({
        uri: wiki,
        headers: {
            accept:
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9"
        },
        gzip: true,
    });

    let $ = cheerio.load(response);


    // Extracting the rank and name of the company; for now only for the 1st row.
    // I will run a loop 50 times to get all 50 rows, but the two lines below are not working.

    // On testing these selectors in the Chrome console they give the correct output,
    // but they do not work here; same thing for name.
    let rank = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) th').text();

    let name = $('table[class="wikitable sortable jquery-tablesorter"] tbody tr:nth-child(1) td:nth-child(2)').text();

    // saving data
    data.push({
        rank,
        name,
    });

    // exporting to csv
    const j2cp = new json2csv();
    const csv = j2cp.parse(data);

    fs.writeFileSync("./imdb1.csv", csv, "utf-8");
})();


Please let me know where the error is. I followed this tutorial, in case you want to know where the code comes from: https://www.youtube.com/watch?v=BqGq9MTSt7g. Any help is appreciated. Thank you.

javascript node.js web-scraping wikipedia cheerio
1 Answer

0 votes

I tried your code, and the jQuery selector you used seems fine, but somehow cheerio returns nothing for it. The most likely reason: the `jquery-tablesorter` class is added to the table by the page's JavaScript after it loads in the browser, so the selector matches in the Chrome console but not in the raw HTML that cheerio parses; there the attribute is just `class="wikitable sortable"`, and `table[class="..."]` requires an exact match of the whole attribute.
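You can confirm this with a quick check (a minimal sketch; the values in the comments are what I expect to see, not captured output):

    const request = require("request-promise");
    const cheerio = require("cheerio");

    const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";

    request({ uri: wiki, gzip: true }).then(html => {
        const $ = cheerio.load(html);
        // exact attribute match: the class attribute must be this string verbatim
        console.log($('table[class="wikitable sortable jquery-tablesorter"]').length); // expect 0
        // a class selector matches on the classes actually present in the server HTML
        console.log($('table.wikitable').length);                // expect >= 1
        console.log($('table.wikitable').first().attr('class')); // no jquery-tablesorter here
    });

With that confirmed, here is my modified code, updated so that it also gets the other columns.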

    var req = require("request-promise"),
        cheerio = require("cheerio"),
        fs = require("fs"),
        json2csv = require("json2csv").Parser;

    // page which I want to scrape
    const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";

    (async () => {
        const response = await req({
            uri: wiki,
            headers: {
                accept:
                    "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "en-US,en;q=0.9"
            },
            gzip: true,
        }).then(function (html) {
            let $ = cheerio.load(html);
            let data = [];
            let data2 = [];
            let name, rank, cols, col;
            // 'table.wikitable' matches the raw server HTML, unlike the exact attribute selector
            let rows = $('table.wikitable tbody tr').each((idx, elem) => {
                rank = $(elem).find('th').text().replace(/[\n\r]+/g, '');
                // name = $(elem).find('td a').html();
                data2 = [];
                cols = $(elem).find('td').each((colidx, colelem) => {
                    col = $(colelem).text().replace(/[\n\r]+/g, '');
                    data2.push(col);
                });
                data.push({
                    rank,
                    ...data2,
                });
            });
            // export to csv
            const j2cp = new json2csv();
            const csv = j2cp.parse(data);
            fs.writeFileSync("./imdb1.csv", csv, "utf-8");
        }).catch(function (err) {
            console.log(err);
        });
    })();
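One note on the output: spreading `data2` into the row object gives the CSV numeric column headers (0, 1, 2, …). If you want the table's own header text as column names, you could read the header row first. A sketch (the `fields` option is part of json2csv's documented API; the output file name `companies.csv` is my own choice):

    const fs = require("fs");
    const request = require("request-promise");
    const cheerio = require("cheerio");
    const Json2csv = require("json2csv").Parser;

    const wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue";

    request({ uri: wiki, gzip: true }).then(html => {
        const $ = cheerio.load(html);
        // scope to the first wikitable in case the page has more than one
        const table = $('table.wikitable').first();

        // column names come from the header row (the first tr, whose cells are th)
        const fields = [];
        table.find('tr').first().find('th').each((i, th) => {
            fields.push($(th).text().replace(/[\n\r]+/g, '').trim());
        });

        // one object per data row, keyed by the header names
        const data = [];
        table.find('tr').slice(1).each((i, tr) => {
            const row = {};
            // the rank cell is a th on this page, so select th and td together
            $(tr).find('th, td').each((j, cell) => {
                row[fields[j]] = $(cell).text().replace(/[\n\r]+/g, '').trim();
            });
            data.push(row);
        });

        const csv = new Json2csv({ fields }).parse(data);
        fs.writeFileSync("./companies.csv", csv, "utf-8");
    }).catch(err => console.log(err));

Scoping to `.first()` also protects you if Wikipedia adds another wikitable to the article later.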