我正在构建一个网页抓取工具(web scraper)。我有一个由选择器组成的 JSON 对象,需要遍历它,以便抓取页面上对应的每个值并收集数据。
我该如何编写一个 lodash 函数,递归地遍历每个属性,并对每个选择器执行同一个函数,以获取对应元素的内部文本值?
// Selector map to scrape: every leaf is a CSS selector string, and `comps`
// holds one group of selectors per table row.
let startingJson = {
  address: "tbody > tr > td:nth-of-type(4) > p",
  comps: [
    {
      address: "tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p",
      link: "tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p > a",
      dateLastSold: "tbody > tr:nth-of-type(1) > td:nth-of-type(2) > p",
      value: "tbody > tr:nth-of-type(1) > td:nth-of-type(3) > p",
    },
    {
      address: "tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p",
      link: "tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p > a",
      dateLastSold: "tbody > tr:nth-of-type(2) > td:nth-of-type(2) > p",
      value: "tbody > tr:nth-of-type(2) > td:nth-of-type(3) > p",
    },
  ],
};
// Example of the desired result: the same keys as the selector map, with each
// selector replaced by a scraped text value.
let finalJsonExample = {
  address: "123 Main Street",
  comps: [
    {
      address: "234 Main Street",
      link: "abc.com",
      dateLastSold: "10/20/19",
      value: "100000",
    },
    {
      address: "345 Main Street",
      link: "def.com",
      dateLastSold: "10/21/19",
      value: "110000",
    },
  ],
};
你只需要递归地遍历这个由选择器组成的对象,并在遍历过程中从页面上抓取元素。当某个值的类型是 string 时,就说明已经到达了叶子节点。
要做到这一点,你可以使用 Array.prototype.map()(https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array/map)、Array.prototype.reduce() 和 Document.querySelector():
// Selector map used by the demo: string leaves are CSS selectors, nested in
// plain objects (`comps` entries) and in a plain array (`list`).
const startingJson = {
  address: 'tbody > tr > td:nth-of-type(4) > p',
  bar: '#baz',
  comps: [
    {
      address: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(1) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(1) > td:nth-of-type(3) > p',
      qux: '#qux',
    },
    {
      address: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(2) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(2) > td:nth-of-type(3) > p',
    },
  ],
  list: ['.foo p', '.bar span'],
};
/**
 * Recursively resolves a tree of CSS selectors into the text of the matching
 * page elements, returning a structure with the same shape as the input.
 *
 * @param {*} selectors - A selector string, an array, or an object whose
 *   leaves are selector strings.
 * @returns {*} For a string: the `textContent` of the first matching element
 *   (`''` when that text is falsy), or `null` when nothing matches. For an
 *   array or object: a new array or object with every entry resolved by this
 *   same function. `null` for any other input, including `null` itself.
 */
function getElements(selectors) {
  if (typeof selectors === 'string') {
    // A string is a selector we can use to grab an element from the page.
    // You might want to use document.querySelectorAll instead, in case more
    // than one element matches, and handle that appropriately.
    const element = document.querySelector(selectors);
    // Extract the `textContent` from that element, if any:
    return element ? (element.textContent || '') : null;
  }
  if (Array.isArray(selectors)) {
    // Arrays are resolved entry by entry into a new array of the same size.
    return selectors.map((selector) => getElements(selector));
  }
  // `typeof null` is also 'object'; without the explicit null check,
  // Object.entries(null) would throw instead of reaching the fallback below.
  if (selectors !== null && typeof selectors === 'object') {
    // Objects keep their keys; each value is resolved recursively.
    const elements = {};
    for (const [key, value] of Object.entries(selectors)) {
      elements[key] = getElements(value);
    }
    return elements;
  }
  // Anything else (null, undefined, numbers, ...) cannot be used as a selector.
  return null;
}
// Resolve every selector in `startingJson` and print the resulting structure.
console.log(getElements(startingJson));
/* Allow elements with class `as-console-wrapper` to use the full available
   height (presumably the snippet runner's console panel; confirm).
   `!important` makes this win over any competing max-height rule. */
.as-console-wrapper {
max-height: 100% !important;
}
<!-- Fixture markup targeted by the `.foo p`, `.bar span`, `#baz` and `#qux` selectors. -->
<!-- The paragraph is now closed with </p>; the original second <p> opened a stray empty paragraph. -->
<div class="foo"><p>Foo</p></div>
<div class="bar"><span>Bar</span></div>
<div id="baz">Baz</div>
<div id="qux">Qux</div>
你可以使用 lodash 的 _.transform(),它既能处理对象,也能处理数组。调用下面的 recursiveTransform 时,需要传入一个函数(fn)来处理非对象值。
在这种情况下,你可以对每个值调用 Document.querySelector(),然后从返回的节点中提取文本内容。
/**
 * Walks `val` with lodash's `_.transform`, rebuilding it with every
 * non-object value replaced by `fn(value)`.
 *
 * @param {Function} fn - Called with each value for which `_.isObject` is false.
 * @param {*} val - The structure to walk.
 * @returns {*} The accumulator produced by `_.transform`.
 */
const recursiveTransform = (fn, val) => {
  const assignEntry = (target, entry, name) => {
    if (_.isObject(entry)) {
      // Nested containers are rebuilt by recursing into them.
      target[name] = recursiveTransform(fn, entry);
    } else {
      target[name] = fn(entry);
    }
  };
  return _.transform(val, assignEntry);
};
// Selector map used by the demo: string leaves are CSS selectors, nested in
// plain objects (`comps` entries) and in a plain array (`list`).
const startingJson = {
  address: 'tbody > tr > td:nth-of-type(4) > p',
  bar: '#baz',
  comps: [
    {
      address: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(1) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(1) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(1) > td:nth-of-type(3) > p',
      qux: '#qux',
    },
    {
      address: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p',
      link: 'tbody > tr:nth-of-type(2) > td:nth-of-type(1) > p > a',
      dateLastSold: 'tbody > tr:nth-of-type(2) > td:nth-of-type(2) > p',
      value: 'tbody > tr:nth-of-type(2) > td:nth-of-type(3) > p',
    },
  ],
  list: ['.foo p', '.bar span'],
};
// Resolve every selector in `startingJson` to the text of the first matching
// element; a selector that matches nothing yields whatever querySelector
// returned for it.
const result = recursiveTransform((selector) => {
  const node = document.querySelector(selector);
  if (!node) {
    return node;
  }
  return node.textContent;
}, startingJson);
console.log(result);
/* Allow elements with class `as-console-wrapper` to use the full available
   height (presumably the snippet runner's console panel; confirm).
   `!important` makes this win over any competing max-height rule. */
.as-console-wrapper {
max-height: 100% !important;
}
<!-- Loads lodash 4.17.15 from cdnjs; it provides the global `_` used by the script. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/lodash.js/4.17.15/lodash.js"></script>
<!-- Fixture markup targeted by the `.foo p`, `.bar span`, `#baz` and `#qux` selectors. -->
<!-- The paragraph is now closed with </p>; the original second <p> opened a stray empty paragraph. -->
<div class="foo"><p>Foo</p></div>
<div class="bar"><span>Bar</span></div>
<div id="baz">Baz</div>
<div id="qux">Qux</div>