从亚马逊产品页面获取图片的网址

问题描述 投票:0回答:1

我想从亚马逊抓取产品图片的网址,例如这个链接(this link)。

在网页源代码中,有一部分包含了不同尺寸(large、medium、hiRes 等)图片的所有 URL。我可以用 scrapy 执行下面的语句,拿到脚本的那一部分:

imagesString = (response.xpath('//script[contains(., "ImageBlockATF")]/text()').extract_first())

这给了我一个字符串,它看起来像这样:

P.when('A').register("ImageBlockATF", function(A){ var data = { 'colorImages': { 'initial': [{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/31HoKqtljqL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/31HoKqtljqL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX355_.jpg":[308,355],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX450_.jpg":[390,450],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX425_.jpg":[369,425],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX466_.jpg":[404,466],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX522_.jpg":[453,522],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX569_.jpg":[494,569],"https://images-na.ssl-images-amazon.com/images/I/81FED1p-sTL._SX679_.jpg":[589,679]},"variant":"MAIN","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SL1500_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/31Y%2B8oE5DtL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/31Y%2B8oE5DtL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX355_.jpg":[308,355],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX450_.jpg":[390,450],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX425_.jpg":[369,425],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX466_.jpg":[404,466],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX522_.jpg":[453,522],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX569_.jpg":[494,569],"https://images-na.ssl-images-amazon.com/images/I/81e8905DlhL._SX679_.jpg":[589,679]},"variant":"PT01","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX355_.jpg":[236,355],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX450_.jpg":[300,450],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX425_.jpg":[283,425],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL._SX466_.jpg":[310,466],"https://images-na.ssl-images-amazon.com/images/I/51rORrvh0hL.jpg":[333,500]},"variant":"PT02","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX355_.jpg":[236,355],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX450_.jpg":[300,450],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX425_.jpg":[283,425],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL._SX466_.jpg":[310,466],"https://images-na.ssl-images-amazon.com/images/I/41L2OU5rPyL.jpg":[333,500]},"variant":"PT03","lowRes":null},{"hiRes":null,"thumb":"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SS40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX355_.jpg":[236,355],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX450_.jpg":[300,450],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX425_.jpg":[283,425],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL._SX466_.jpg":[310,466],"https://images-na.ssl-images-amazon.com/images/I/51%2BsCYjx6OL.jpg":[333,500]},"variant":"PT04","lowRes":null}]}, 'colorToAsin': {'initial': {}}, 'holderRatio': 1.0, 'holderMaxHeight': 700, 'heroImage': {'initial': []}, 'heroVideo': {'initial': []}, 'spin360ColorData': {'initial': {}}, 'spin360ColorEnabled': {'initial': 0}, 'spin360ConfigEnabled': false, 'spin360LazyLoadEnabled': false, 'playVideoInImmersiveView':'false', 'tabbedImmersiveViewTreatment':'T2', 'totalVideoCount':'0', 'videoIngressATFSlateThumbURL':'', 'mediaTypeCount':'0', 'atfEnhancedHoverOverlay' : true, 'winningAsin': 'B00XLSS79Y', 'weblabs' : {}, 'aibExp3Layout' : 1, 'aibRuleName' : 'frank-powered', 'acEnabled' : false }; A.trigger('P.AboveTheFold'); // trigger ATF event. return data; });

我的目标是把 colorImages 里面的数据放进一个 JSON 字典,这样我就可以轻松获取每个 URL。

为了取出 colorImages 的数据,我试图做这样的事情:

m = re.search(r'^var data = ({.*};)', imagesString , re.S | re.M)
data = m.group()
jsonObj = json.loads(data[:-1].replace("'", '"'))

但 imagesString 似乎无法很好地配合 re.search 使用:我不断收到错误,提示 imagesString 不是字符串,而它实际上是字符串。

我曾用 re.findall 从亚马逊页面得到过类似的数据,像这样(script 是我从网站得到的文本块):

variationValues = re.findall(r'variationValues\" : ({.*?})', ' '.join(script))[0]

接着

variationValuesDict = json.loads(variationValues)

但我对正则表达式的了解其实并不多。

从我上面粘贴的字符串中,我删掉了开头和结尾,只保留数据部分,于是剩下这些:

https://jsoneditoronline.org/?id=9ea92643044f4ac88bcc3e76d98425fc

我无法弄清楚如何用 re.findall() 获得 colorImages(或 JSON 编辑器中的数据),以便之后把它加载为 JSON 并像字典一样使用。关于如何实现这一点,有什么想法吗?

json regex python-3.x scrapy
1个回答
0
投票

你只需要把最初的 var data 转换为格式正确的 JSON。这很容易 :) 只需把所有的 ' 字符替换为 ",并删除空格。你会得到 JSON 对象:(这就是你需要的正确 JSON)enter image description here

© www.soinside.com 2019 - 2024. All rights reserved.