如何递归遍历 JSON 对象并对其中的每个选择器执行 querySelector?

问题描述 投票:-1回答:1

我正在构建一个网页爬虫(web scraper)。我有一个由选择器组成的 JSON 对象,需要遍历它,以便抓取页面上每个选择器对应的值并收集数据。

我如何创建一个 lodash 函数,递归遍历每个属性,并执行一个标准函数,根据选择器获取对应元素的内部文本(innerText)值?

// Input of the scraper: every leaf is a CSS selector string locating the
// element whose text should be captured. `comps` holds one selector group
// per comparable row.
let startingJson = {
  address: 'tbody > tr > td:nth-of-type(4) > p',
  comps: [
    {
      address: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(1) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(1) > td:nth-of-type(3) > p',
    },
    {
      address: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(2) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(2) > td:nth-of-type(3) > p',
    },
  ],
};


// Desired output: the same shape as the selector tree, with each selector
// replaced by the text scraped from the page.
let finalJsonExample = {
  address: '123 Main Street',
  comps: [
    {
      address: '234 Main Street',
      link: 'abc.com',
      dateLastSold: '10/20/19',
      value: '100000',
    },
    {
      address: '345 Main Street',
      link: 'def.com',
      dateLastSold: '10/21/19',
      value: '110000',
    },
  ],
};
javascript arrays lodash
1个回答
0
投票

你只需要递归遍历那个包含选择器的对象,并在遍历过程中从页面上抓取元素。判断是否到达叶子节点很简单:叶子节点的类型是 string。

要做到这一点,你可以使用 Array.prototype.map()(https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map)、Array.prototype.reduce() 和 Document.querySelector():

// Selector tree used by the demo: string leaves are CSS selectors, nested
// objects and arrays only provide structure. `bar`, `qux` and `list` target
// the demo markup (#baz, #qux, .foo p, .bar span).
const startingJson = {
  address: 'tbody > tr > td:nth-of-type(4) > p',
  bar: '#baz',
  comps: [
    {
      address: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(1) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(1) > td:nth-of-type(3) > p',
      qux: '#qux',
    },
    {
      address: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(2) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(2) > td:nth-of-type(3) > p',
    },
  ],
  list: ['.foo p', '.bar span'],
};

/**
 * Recursively resolves a tree of CSS selectors into the text found on the page.
 *
 * The result mirrors the shape of the input: arrays stay arrays of the same
 * length and objects keep their keys, while every string leaf is replaced by
 * the text content of the first element matching that selector.
 *
 * @param {*} selectors - A CSS selector string, an array of selector trees,
 *   or an object whose values are selector trees.
 * @returns {*} The text content for a string selector (`''` when the matched
 *   element's text is empty, `null` when nothing matches), an array or object
 *   of resolved values for arrays and objects, and `null` for any other input
 *   (including `null` itself).
 */
function getElements(selectors) {
  if (typeof selectors === 'string') {
    // If `selectors` is a string, that's a selector we can use to grab an
    // element from the page. You might want to use document.querySelectorAll
    // instead, in case more than one element is returned, and handle that
    // appropriately:
    const element = document.querySelector(selectors);

    // Extract the `textContent` from that element, if any:
    return element ? (element.textContent || '') : null;
  }

  if (Array.isArray(selectors)) {
    // If we are passed an array, we need to recursively explore all the children
    // while filling in an array of the same size that we are going to return:
    return selectors.map(selector => getElements(selector));
  }

  // `typeof null` is also 'object', so it must be excluded explicitly;
  // otherwise Object.entries(null) would throw a TypeError instead of
  // reaching the `return null` fallback below.
  if (selectors !== null && typeof selectors === 'object') {
    // Lastly, if we are passed an object, we need to explore all its entries
    // and return an object with the same keys but with the values resolved using
    // this same function recursively:
    return Object.entries(selectors).reduce((elements, [key, value]) => {
      elements[key] = getElements(value);

      return elements;
    }, {});
  }

  // Anything else (null, undefined, numbers, booleans, ...) is not a selector.
  return null;
}

// Resolve every selector in the tree against the page and print the result.
const scrapedValues = getElements(startingJson);
console.log(scrapedValues);
/* Let .as-console-wrapper grow to the full height of its container;
   !important makes this win over any competing max-height rule.
   NOTE(review): presumably the snippet runner's console panel - confirm. */
.as-console-wrapper {
  max-height: 100% !important;
}
<div class="foo"><p>Foo</p></div>
<div class="bar"><span>Bar</span></div>
<div id="baz">Baz</div>
<div id="qux">Qux</div>

0
投票

你可以使用 lodash 的 _.transform(),它既能处理对象,也能处理数组。对对象调用下面的 recursiveTransform 时,传入一个函数(fn)来处理非对象值。

在这种情况下,你可以对每个值调用 Document.querySelector(),然后从返回的节点中提取文本内容。

/**
 * Walks `val` with lodash's `_.transform`, producing a structure with the
 * same keys/indices in which every non-object value has been passed
 * through `fn` and every object value has been transformed recursively.
 *
 * @param {Function} fn - Called with each non-object value; its return value
 *   is stored under the same key.
 * @param {Object|Array} val - The structure to walk.
 * @returns {Object|Array} The accumulator returned by `_.transform`.
 */
const recursiveTransform = (fn, val) =>
  _.transform(val, (output, child, prop) => {
    if (_.isObject(child)) {
      output[prop] = recursiveTransform(fn, child);
    } else {
      output[prop] = fn(child);
    }
  });

// Selector tree for the demo: string leaves are CSS selectors; objects and
// arrays only provide structure.
const startingJson = {
  address: 'tbody > tr > td:nth-of-type(4) > p',
  bar: '#baz',
  comps: [
    {
      address: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(1) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(1) > td:nth-of-type(3) > p',
      qux: '#qux',
    },
    {
      address: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(2) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(2) > td:nth-of-type(3) > p',
    },
  ],
  list: ['.foo p', '.bar span'],
};

// Replace every selector leaf with the text content of the first element it
// matches; when nothing matches, the falsy querySelector result is kept as-is.
const result = recursiveTransform((selector) => {
  const node = document.querySelector(selector);

  return node ? node.textContent : node;
}, startingJson);

console.log(result);
/* Let .as-console-wrapper grow to the full height of its container;
   !important makes this win over any competing max-height rule.
   NOTE(review): presumably the snippet runner's console panel - confirm. */
.as-console-wrapper {
  max-height: 100% !important;
}
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.15/lodash.js"></script>
<div class="foo"><p>Foo</p></div>
<div class="bar"><span>Bar</span></div>
<div id="baz">Baz</div>
<div id="qux">Qux</div>
© www.soinside.com 2019 - 2024. All rights reserved.