在脚本标记和隔离元素中刮擦数据

问题描述 投票:0回答:1

我在script标记内抓取数据时遇到了一些问题。我想在jsonSwatchConfig []中获取数据,尤其是每个'option_id'和'us'属性。

var jar = require('request').jar();
var request = require('request').defaults({ jar: jar });
var cheerio = require("cheerio");
var cloudscraper = require('cloudscraper');
var fs = require('fs');
const logT = require('log-timestamp');
var open = require('open');
var today = new Date();
var randomsize;
var webdriver = require('selenium-webdriver');
var request = require("request")
var cheerio = require("cheerio")

var sizes =[]

function product() {
    console.log("Obtaining Product Page..")
    request.get({
        url: 'https://www.sivasdescalzo.com/en/nike-air-max-270-react-eng-cd0113-400',
        jar: request.jar(),
        headers: {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7,pt;q=0.6,fr;q=0.5,de;q=0.4',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'referer': 'https://www.sivasdescalzo.com/en/lifestyle',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'},
         cloudflareTimeout: 5000,
         cloudflareMaxTimeout: 30000,
         followAllRedirects: true,
         //proxy: getproxy(),
         challengesToSolve: 3,
         json: true,
         decodeEmails: false,
         gzip: true,
    }, function(error, response, body) {
        var $ = cheerio.load(body);
        var script = $('body > script').map((i, x) => x.children[0])
                                 .filter((i, x) => x && 
        x.data.match(/jsonSwatchConfig/)).get(0);
        if (script){
            var scriptText = script.data.replace(/\r?\n|\r/g, "")
            console.log(scriptText.split('"Svd_Catalog/js/svd.swatch.renderer"')[1].replace(" ",""));
        }
    });
}

我的实际输出是:

    :{                "jsonSwatchConfig": 
[{"option_id":"18","us":"7","eu":"40","uk":"6","cm":"25"}, 
{"option_id":"16","us":"8","eu":"41","uk":"7","cm":"26"}, 
{"option_id":"14","us":"9","eu":"42.5","uk":"8","cm":"27"}, 
{"option_id":"12","us":"10","eu":"44","uk":"9","cm":"28"}, 
{"option_id":"10","us":"11","eu":"45","uk":"10","cm":"29"}, 
{"option_id":"8","us":"12","eu":"46","uk":"11","cm":"30"}, 
{"option_id":"105","us":"10.5","eu":"44.5","uk":"9.5","cm":"28.5"}, 
{"option_id":"117","us":"11.5","eu":"45.5","uk":"10.5","cm":"29.5"}, 
{"option_id":"110","us":"7.5","eu":"40.5","uk":"6.5","cm":"25.5"}, 
{"option_id":"121","us":"8.5","eu":"42","uk":"7.5","cm":"26.5"}, 
{"option_id":"114","us":"9.5","eu":"43","uk":"8.5","cm":"27.5"}] }              },        
"*" : {            "Magento_Swatches/js/catalog-add-to-cart": {}        }    }

我尝试了很多次,但没有找到获得想要的东西的方法。用过的拆分和删除。

这是html脚本标记源:

    <script type="text/x-magento-init">
{
    "[data-role=swatch-options]": {
        "Magento_Swatches/js/swatch-renderer": {
            "jsonConfig": {"attributes":{"138":{"id":"138","code":"size_us","label":"Size","options":[{"id":"24","label":"4","products":[]},{"id":"172","label":"4.5","products":[]},{"id":"22","label":"5","products":[]},{"id":"160","label":"5.5","products":[]},{"id":"20","label":"6","products":[]},{"id":"165","label":"6.5","products":[]},{"id":"18","label":"7","products":["276814"]},{"id":"110","label":"7.5","products":[]},{"id":"16","label":"8","products":[]},{"id":"121","label":"8.5","products":[]},{"id":"14","label":"9","products":[]},{"id":"114","label":"9.5","products":["242798"]},{"id":"12","label":"10","products":["242796"]},{"id":"105","label":"10.5","products":["242793"]},{"id":"10","label":"11","products":["242794"]},{"id":"117","label":"11.5","products":["276810"]},{"id":"8","label":"12","products":[]}],"position":"0"}},"template":"\u20ac<%- data.price %>","currencyFormat":"\u20ac%s","optionPrices":{"242796":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"242793":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"242794":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"242795":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"242792":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"242797":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"242798":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276810":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276815":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276809":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276811":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276816":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276812":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276813":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276814":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"276817":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}},"310604":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90},"tierPrices":[],"msrpPrice":{"amount":null}}},"priceFormat":{"pattern":"\u20ac%s","precision":2,"requiredPrecision":2,"decimalSymbol":".","groupSymbol":",","groupLength":3,"integerRequired":false},"prices":{"oldPrice":{"amount":90},"basePrice":{"amount":90},"finalPrice":{"amount":90}},"productId":"242799","chooseText":"Choose an Option...","images":[],"index":{"242796":{"138":"12"},"242793":{"138":"105"},"242794":{"138":"10"},"242795":{"138":"16"},"242792":{"138":"121"},"242797":{"138":"14"},"242798":{"138":"114"},"276810":{"138":"117"},"276815":{"138":"24"},"276809":{"138":"172"},"276811":{"138":"22"},"276816":{"138":"160"},"276812":{"138":"20"},"276813":{"138":"165"},"276814":{"138":"18"},"276817":{"138":"110"},"310604":{"138":"8"}}},
            "jsonSwatchConfig": {"138":{"24":{"type":"0","value":"4","label":"4"},"172":{"type":"0","value":"4.5","label":"4.5"},"22":{"type":"0","value":"5","label":"5"},"160":{"type":"0","value":"5.5","label":"5.5"},"20":{"type":"0","value":"6","label":"6"},"165":{"type":"0","value":"6.5","label":"6.5"},"18":{"type":"0","value":"7","label":"7"},"110":{"type":"0","value":"7.5","label":"7.5"},"16":{"type":"0","value":"8","label":"8"},"121":{"type":"0","value":"8.5","label":"8.5"},"14":{"type":"0","value":"9","label":"9"},"114":{"type":"0","value":"9.5","label":"9.5"},"12":{"type":"0","value":"10","label":"10"},"105":{"type":"0","value":"10.5","label":"10.5"},"10":{"type":"0","value":"11","label":"11"},"117":{"type":"0","value":"11.5","label":"11.5"},"8":{"type":"0","value":"12","label":"12"},"additional_data":"{\"swatch_input_type\":\"text\",\"update_product_preview_image\":\"0\",\"use_product_image_for_swatch\":0}"}},
            "mediaCallback": "https://www.sivasdescalzo.com/en/swatches/ajax/media/",
            "gallerySwitchStrategy": "replace",
            "jsonSwatchImageSizeConfig": {"swatchImage":{"width":30,"height":20},"swatchThumb":{"height":90,"width":110}} },
        "Svd_Catalog/js/svd.swatch.renderer": {
            "jsonSwatchConfig": [{"option_id":"24","us":"4","eu":"36","uk":"3","cm":"23"},{"option_id":"22","us":"5","eu":"37.5","uk":"4.5","cm":"23.5"},{"option_id":"20","us":"6","eu":"38.5","uk":"5.5","cm":"24"},{"option_id":"18","us":"7","eu":"40","uk":"6","cm":"25"},{"option_id":"16","us":"8","eu":"41","uk":"7","cm":"26"},{"option_id":"14","us":"9","eu":"42.5","uk":"8","cm":"27"},{"option_id":"12","us":"10","eu":"44","uk":"9","cm":"28"},{"option_id":"10","us":"11","eu":"45","uk":"10","cm":"29"},{"option_id":"8","us":"12","eu":"46","uk":"11","cm":"30"},{"option_id":"105","us":"10.5","eu":"44.5","uk":"9.5","cm":"28.5"},{"option_id":"117","us":"11.5","eu":"45.5","uk":"10.5","cm":"29.5"},{"option_id":"172","us":"4.5","eu":"36.5","uk":"4","cm":"23.5"},{"option_id":"160","us":"5.5","eu":"38","uk":"5","cm":"24"},{"option_id":"165","us":"6.5","eu":"39","uk":"6","cm":"24.5"},{"option_id":"110","us":"7.5","eu":"40.5","uk":"6.5","cm":"25.5"},{"option_id":"121","us":"8.5","eu":"42","uk":"7.5","cm":"26.5"},{"option_id":"114","us":"9.5","eu":"43","uk":"8.5","cm":"27.5"}] }      
    },
    "*" : {
        "Magento_Swatches/js/catalog-add-to-cart": {}
    }
}</script>

我想知道如何做以及最安全的方法。我可以继续拆分和删除,但我认为这不是执行此类任务的正确方法。

javascript json web-scraping cheerio
1个回答
0
投票

如何使用RegExp

var pattern = /"jsonSwatchConfig": (\[.*\])/;
var results = scriptText.match(pattern)[1];

编辑:要提取option_idus属性,首先必须将其从字符串转换为对象。上面的代码为您提供数组jsonSwatchConfig作为字符串,而不是作为实际的对象数组,因此您可以执行此操作(从上面的代码继续):results = JSON.parse(results)现在将把数据作为对象数组提供给您。要访问特定属性,您可以执行以下操作:

var option_id1 = results[0].option_id;
var us1 = results[0].us;
...and so on as needed
© www.soinside.com 2019 - 2024. All rights reserved.