从无法访问的JsonObject()中提取数据

问题描述 投票:1回答:2

我正在尝试访问一个 JsonObject,以便抓取其中的数据。

import requests
from bs4 import BeautifulSoup

# Algolia multi-index query endpoint; the agent string, application id and
# API key are all carried in the query string.
url = 'https://fjrgcwbcbo-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%20(lite)%203.27.0%3Binstantsearch.js%202.8.0%3BJS%20Helper%202.26.0&x-algolia-application-id=FJRGCWBCBO&x-algolia-api-key=a214a1e7afd822b517723830f05e9449'

# Issue a plain GET and decode the response body as JSON.
jsonObj = requests.get(url).json()

# Python names are case-sensitive: print the variable assigned above
# (`jsonObj`), not `JsonObj`, which was never defined and raised NameError.
print(jsonObj)

此URL中有四个JsonObject,但每个JsonObject似乎都被API阻止。 Website URL

非常感谢。

json beautifulsoup python-requests algolia
2个回答
1
投票

您需要同时提供查询参数和请求负载(payload),而弄清楚接口期望以何种方式接收它们比较棘手。不过下面的代码可以解决问题:

import requests
import json

# Algolia multi-index query endpoint (credentials go in `params` below).
url = 'https://fjrgcwbcbo-dsn.algolia.net/1/indexes/*/queries'

# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# Query-string parameters: client agent, application id and API key.
params = {
    'x-algolia-agent': 'Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.8.0;JS Helper 2.26.0',
    'x-algolia-application-id': 'FJRGCWBCBO',
    'x-algolia-api-key': 'a214a1e7afd822b517723830f05e9449',
}

# Request payload: a single search against one index. The inner "params"
# value is itself a URL-encoded query string (filters, facets, paging).
data = {"requests":[{"indexName":"CAPI-agents-stage","params":"query=&hitsPerPage=20&maxValuesPerFacet=10&page=0&filters=source.name%3Atremplin%20AND%20NOT%20source.id%3A400%20AND%20NOT%20source.id%3A2662%20AND%20NOT%20source.id%3A3292%20AND%20NOT%20source.id%3A720%20AND%20NOT%20source.id%3A1%20AND%20NOT%20source.id%3A5167%20AND%20NOT%20source.id%3A177%20AND%20NOT%20source.id%3A7907%20AND%20NOT%20source.id%3A4999%20AND%20NOT%20source.id%3A979%20AND%20NOT%20source.id%3A5538%20AND%20NOT%20source.id%3A8062%20AND%20NOT%20source.id%3A7908%20AND%20NOT%20source.id%3A38%20AND%20NOT%20source.id%3A7957%20AND%20NOT%20source.id%3A8084%20AND%20NOT%20source.id%3A8078%20AND%20NOT%20source.id%3A8158%20AND%20NOT%20source.id%3A9298%20AND%20NOT%20source.id%3A9299%20AND%20NOT%20source.id%3A9368%20AND%20NOT%20source.id%3A540%20AND%20NOT%20source.id%3A7905%20AND%20NOT%20source.id%3A10173&facets=%5B%22identity.last_name%22%2C%22geolocation.city%22%2C%22geolocation.postal_code%22%2C%22geolocation.region%22%2C%22geolocation.department%22%2C%22geolocation.country%22%2C%22specialities%22%5D&tagFilters="}]}
jsondata = json.dumps(data)

# POST the serialized payload. `headers` was previously built but never
# sent; pass it so the User-Agent actually reaches the server.
response = requests.post(url, data=jsondata, params=params, headers=headers)
# Fail loudly on an HTTP error instead of a KeyError on 'results' below.
response.raise_for_status()
jsonObj = response.json()

# Dump the first (and only) result set, then each hit's e-mail address.
print(jsonObj['results'][0])

for each in jsonObj['results'][0]['hits']:
    print(each['email_address'])

额外:

我做了一些调整,让你可以更方便地修改查询/数据参数。

import json
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.parse` is loaded.
import urllib.parse

import requests

# Algolia multi-index query endpoint (credentials go in `params` below).
url = 'https://fjrgcwbcbo-dsn.algolia.net/1/indexes/*/queries'

# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}

# Query-string parameters: client agent, application id and API key.
params = {
    'x-algolia-agent': 'Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.8.0;JS Helper 2.26.0',
    'x-algolia-application-id': 'FJRGCWBCBO',
    'x-algolia-api-key': 'a214a1e7afd822b517723830f05e9449',
}

# A simpler way to enter the search parameters: plain, un-encoded values
# that are URL-encoded in one place further down.
query = {
    "indexName": "CAPI-agents-stage",
    "hitsPerPage": "20",
    "maxValuesPerFacet": "10",
    "page": "0",
    "filters": "source.name:tremplin AND NOT source.id:400 AND NOT source.id:2662 AND NOT source.id:3292 AND NOT source.id:720 AND NOT source.id:1 AND NOT source.id:5167 AND NOT source.id:177 AND NOT source.id:7907 AND NOT source.id:4999 AND NOT source.id:979 AND NOT source.id:5538 AND NOT source.id:8062 AND NOT source.id:7908 AND NOT source.id:38 AND NOT source.id:7957 AND NOT source.id:8084 AND NOT source.id:8078 AND NOT source.id:8158 AND NOT source.id:9298 AND NOT source.id:9299 AND NOT source.id:9368 AND NOT source.id:540 AND NOT source.id:7905 AND NOT source.id:10173",
    # Serialize the facet list as compact JSON (double quotes, no spaces).
    # str(list) would emit a Python repr with single quotes, which is not
    # the JSON array form used by the hand-encoded payload.
    "facets": json.dumps(
        ["identity.last_name", "geolocation.city", "geolocation.postal_code", "geolocation.region", "geolocation.department", "geolocation.country", "specialities"],
        separators=(",", ":"),
    ),
    "tagFilters": "",
}

# URL-encode the search parameters into the inner "params" string.
# quote_via=quote percent-encodes spaces as %20 (urlencode's default,
# quote_plus, would emit '+').
search_params = urllib.parse.urlencode(
    {
        "query": "",
        "hitsPerPage": query["hitsPerPage"],
        "maxValuesPerFacet": query["maxValuesPerFacet"],
        "page": query["page"],
        "filters": query["filters"],
        "facets": query["facets"],
        "tagFilters": query["tagFilters"],
    },
    quote_via=urllib.parse.quote,
)

# Build the request body with json.dumps rather than %-formatting a JSON
# template, so every value is escaped correctly.
data = json.dumps(
    {"requests": [{"indexName": query["indexName"], "params": search_params}]}
)

# POST the payload. `headers` was previously built but never sent; pass it
# so the User-Agent actually reaches the server.
response = requests.post(url, data=data, params=params, headers=headers)
# Fail loudly on an HTTP error instead of a KeyError on 'results' below.
response.raise_for_status()
jsonObj = response.json()


# Dump the first (and only) result set, then each hit's e-mail address.
print(jsonObj['results'][0])

for each in jsonObj['results'][0]['hits']:
    print(each['email_address'])

0
投票

我找到了另一种解决方案,可以一次性抓取所有电子邮件地址。

import requests
import json
from algoliasearch import algoliasearch

# REST endpoint for reference; it is not used by the client calls below.
# The literal was previously split across two lines, which is a SyntaxError.
url = 'https://fjrgcwbcbo-dsn.algolia.net/1/indexes/*/queries'

# Create a client from the application id and API key, then open the index.
client = algoliasearch.Client("FJRGCWBCBO", "a214a1e7afd822b517723830f05e9449")
index = client.init_index('CAPI-agents-stage')

# Iterate over the records returned by browse_all for an empty query and
# print each one's e-mail address. The print must be indented to sit inside
# the loop body.
for hit in index.browse_all({"query": ""}):
    print(hit['email_address'])

希望这能对其他人有所帮助。非常感谢 Chitown88。

© www.soinside.com 2019 - 2024. All rights reserved.