我有一个数据库,其中保存了图像名称及其哈希值。我用 bit_count 函数配合按位异或(^)运算来计算汉明距离,再据此算出图像的相似度指数,供后续处理使用。
在生产环境中,我们使用的数据库要大得多,包含约 1000 万张图像及其哈希值,而 Elasticsearch 看起来是最快的方案。但我在尝试用 Python 的 elasticsearch_dsl 把下面这条 SQL 查询改写成 Elasticsearch 查询时遇到了一些麻烦:
-- Rank stored image hashes by similarity to the target hash 0x1800387EE3414302.
-- hamming_distance is the number of set bits in (hash XOR target).
-- NOTE(review): similarity_index divides by the hex-digit count of the hash,
-- not by its bit count; confirm this is the intended scale.
SELECT
    imagename,
    hex(hash),
    LENGTH(hex(hash)) AS "hash_length",
    bit_count(hash ^ 0x1800387EE3414302) AS "hamming_distance",
    100 * (1 - bit_count(hash ^ 0x1800387EE3414302) / LENGTH(hex(hash))) AS "similarity_index"
FROM hashes
WHERE
    -- keep rows whose similarity index is at least 75
    100 * (1 - bit_count(hash ^ 0x1800387EE3414302) / LENGTH(hex(hash))) >= 75
    -- and whose hex representation is as long as the target's
    AND LENGTH(hex(hash)) = LENGTH('1800387EE3414302');
# Search object bound to `client` and targeting the "hash" index.
request = Search(using=client, index="hash")
def createSimIndex():
    """Register the stored Painless script ``simindex`` in Elasticsearch.

    The script compares the document's ``hash`` field with
    ``params.targetHash`` and returns ``1 - hamming_distance / 64``:
    1.0 when the two 64-bit hashes are identical, 0.0 when every bit differs.

    Uses the module-level ``client`` to issue the ``put_script`` call and
    returns nothing.
    """
    script_lang = "painless"
    script_name = "simindex"
    script_source = """
        // Both operands are typed as long so that a small target value that
        // arrives as a 32-bit integer is widened before any bit operation.
        long targetHash = params.targetHash; // target hash in decimal format
        long documentHash = doc['hash'].value;
        // XOR leaves a 1 in every position where the hashes differ;
        // counting those bits gives the Hamming distance (0..64).
        int hammingDistance = Long.bitCount(documentHash ^ targetHash);
        // Similarity index: 1 - (hammingDistance / 64)
        return 1 - (hammingDistance / 64.0);
    """
    # Request body kept under its own name instead of rebinding the source.
    script_body = {
        "script": {
            "lang": script_lang,
            "source": script_source,
        }
    }
    client.put_script(id=script_name, body=script_body)
# Script field evaluated by the stored "simindex" script.
# NOTE(review): `targetHash` is not defined in this snippet; its first
# element is passed as the script parameter. Confirm where it comes from.
simindex_field = {
    "script": {
        "id": "simindex",
        "params": {"targetHash": targetHash[0]},
    }
}
# NOTE(review): refers to a stored script "giveHash" that this snippet
# never creates. Confirm it exists on the cluster.
hash_field = {"script": {"id": "giveHash"}}

# Attach both script fields to the search, then run it through scan().
request = request.script_fields(simindex=simindex_field, hash=hash_field)
response = request.scan()
到这一步我遇到了一些问题,需要进一步的帮助:
我已经设法让它在 Kibana 中成功运行,但不知道接下来该怎么做。
POST _scripts/hammingdistance
{
  "script": {
    "lang": "painless",
    "source": """
      // Both operands are typed as long so that a small target value that
      // arrives as a 32-bit integer is widened before any bit operation.
      long targetHash = params.targetHash; // target hash in decimal format
      long dbHash = doc['hash'].value;
      // Identical hashes score 0.0, as in the original script.
      // NOTE(review): presumably this hides the query image itself; confirm.
      if (dbHash == targetHash) {
        return 0.0;
      }
      // XOR leaves a 1 in every position where the hashes differ;
      // counting those bits gives the Hamming distance (0..64).
      int hammingDistance = Long.bitCount(dbHash ^ targetHash);
      // Similarity index: 1 - (hammingDistance / 64)
      return 1 - (hammingDistance / 64.0);
    """
  }
}
# Score every document in the "hash" index with the stored "hammingdistance"
# script, sort by score in descending order, and apply a min_score of 0.75.
GET hash/_search
{
"query": {
"script_score": {
"query": {
"match_all": {}
},
"script": {
"id": "hammingdistance",
"params": {
"targetHash": 9073290683095160
}
}
}
},
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"min_score": 0.75
}
最终我没有使用 Elasticsearch DSL,而是在 Python 中直接使用普通的 Elasticsearch 库(elasticsearch-py),把整个查询原样作为请求体传入。
使用下面的函数在 Elasticsearch 中创建存储脚本(与问题中的函数类似):
def createScript():
    """Register the stored Painless script ``hammingdistance`` in Elasticsearch.

    The script compares the document's ``hash`` field with
    ``params.targetHash`` and returns ``1 - hamming_distance / 64``:
    1.0 when the two 64-bit hashes are identical, 0.0 when every bit differs.

    Uses the module-level ``es`` client to issue the ``put_script`` call and
    returns nothing.
    """
    script_lang = "painless"
    script_name = "hammingdistance"
    script_source = """
        // Both operands are typed as long so that a small target value that
        // arrives as a 32-bit integer is widened before any bit operation.
        long targetHash = params.targetHash; // target hash in decimal format
        long documentHash = doc['hash'].value;
        // XOR leaves a 1 in every position where the hashes differ;
        // counting those bits gives the Hamming distance (0..64).
        int hammingDistance = Long.bitCount(documentHash ^ targetHash);
        // Similarity index: 1 - (hammingDistance / 64)
        return 1 - (hammingDistance / 64.0);
    """
    # Request body kept under its own name instead of rebinding the source.
    script_body = {
        "script": {
            "lang": script_lang,
            "source": script_source,
        }
    }
    es.put_script(id=script_name, body=script_body)
在 elasticsearch-py 中执行搜索并按得分排序:
# Score every document with the stored "hammingdistance" script; `target` is
# handed to the script as params.targetHash.
search_param = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "id": "hammingdistance",
                "params": {"targetHash": target},
            },
        }
    },
    # Order hits by score, highest first.
    "sort": [{"_score": {"order": "desc"}}],
    # Score threshold sent with the request (passed as a string here).
    "min_score": "0.5",
}
response = elastic_client.search(index='hash', body=search_param)

# Pretty-print the hit list, then report the top score of the response.
hits = response["hits"]
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(hits["hits"])
print(f"Highest similarity index: {hits['max_score']}")