如何在父元数据的 Azure AI 矢量搜索中过滤分块 pdf?

问题描述 投票:0回答:1

我刚开始使用 Azure 上的搜索功能,并且为了使我的块可过滤而付出了很大的努力。这是我的情况: 我有一个 blob,在不同的文件夹中包含数百个 pdf 文件。每个 blob 都有一个元数据列和存储在其中的值的列表。为了能够按语义搜索它们,我使用 Azure 门户并单击“导入和矢量化”按钮应用了开箱即用的矢量搜索。这将设置用于分块的技能组、索引器并创建索引。由此,我可以搜索我的数据,并对其进行很好的过滤。问题是来自父文档的元数据并未存储在每个块中,因此我在运行矢量搜索时无法过滤块。

我怎么也弄不明白如何将父文档的元数据映射到每个块!谁能帮帮我?还有没有更好的方法?比如直接在父文档上过滤?我正在寻找最简单的实现方式。

提前致谢。

我尝试了索引器、拆分器等的一些变体,但主要以与“标题”完全相同的方式定义索引中的元数据列,然后尝试将名称“metadata_prosjektnummer”映射到索引器中的“prosjektnummer”(我不知道这是否是正确的命名约定,blob-pdf 中的元数据简称为“prosjektnummer”)。由于某种原因,这总是会产生 null。我单击索引器上的“运行”按钮进行调试,它似乎并不总是执行任何索引,我得到 0/0。如果这是罪魁祸首,当我更改 JSON 时,我将如何重新运行索引器?

以下是我的定义。请忽略对姓名的引用,因为我已尝试匿名。但元数据保持不变。

索引器:

{
  "@odata.context": "https://documentsearch.search.windows.net/$metadata#indexers/$entity",
  "@odata.etag": "...",
  "name": "name",
  "description": null,
  "dataSourceName": "datasourcename",
  "skillsetName": "skillsetname",
  "targetIndexName": "indexname",
  "disabled": null,
  "schedule": null,
  "parameters": {
    "batchSize": null,
    "maxFailedItems": null,
    "maxFailedItemsPerBatch": null,
    "base64EncodeKeys": null,
    "configuration": {
      "dataToExtract": "contentAndMetadata",
      "parsingMode": "default"
    }
  },
  "fieldMappings": [
    {
      "sourceFieldName": "metadata_storage_name",
      "targetFieldName": "title",
      "mappingFunction": null
    },
    {
      "sourceFieldName": "prosjektnummer",
      "targetFieldName": "prosjektnummer",
      "mappingFunction": null
    }
  ],
  "outputFieldMappings": [],
  "cache": null,
  "encryptionKey": null
}

索引:

{
  "@odata.context": "https://documentsearch.search.windows.net/$metadata#indexes/$entity",
  "@odata.etag": "",
  "name": "name",
  "defaultScoringProfile": null,
  "fields": [
    {
      "name": "chunk_id",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": true,
      "facetable": true,
      "key": true,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "keyword",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "parent_id",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": true,
      "facetable": true,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "chunk",
      "type": "Edm.String",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "title",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "prosjektnummer",
      "type": "Edm.String",
      "searchable": true,
      "filterable": true,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "vector",
      "type": "Collection(Edm.Single)",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "stored": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": 1536,
      "vectorSearchProfile": "vectorprofilename",
      "synonymMaps": []
    }
  ],
  "scoringProfiles": [],
  "corsOptions": null,
  "suggesters": [],
  "analyzers": [],
  "normalizers": [],
  "tokenizers": [],
  "tokenFilters": [],
  "charFilters": [],
  "encryptionKey": null,
  "similarity": {
    "@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
    "k1": null,
    "b": null
  },
  "semantic": null,
  "vectorSearch": {
    "algorithms": [
      {
        "name": "vector algorithm",
        "kind": "hnsw",
        "hnswParameters": {
          "metric": "cosine",
          "m": 4,
          "efConstruction": 400,
          "efSearch": 500
        },
        "exhaustiveKnnParameters": null
      }
    ],
    "profiles": [
      {
        "name": "vectorprofilename",
        "algorithm": "vectoralgorithmname",
        "vectorizer": "vector-vectorizer",
        "compression": null
      }
    ],
    "vectorizers": [
      {
        "name": "vector-vectorizer",
        "kind": "azureOpenAI",
        "azureOpenAIParameters": {
          "resourceUri": "https://documentsearch-oai.openai.azure.com",
          "deploymentId": "document-embedding",
          "apiKey": "<redacted>",
          "authIdentity": null
        },
        "customWebApiParameters": null
      }
    ],
    "compressions": []
  }
}

技能:

{
  "@odata.context": "https://documentsearch.search.windows.net/$metadata#skillsets/$entity",
  "@odata.etag": "",
  "name": "vector-skillset",
  "description": "Skillset to chunk documents and generate embeddings",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "#1",
      "description": null,
      "context": "/document/pages/*",
      "resourceUri": "https://.openai.azure.com",
      "apiKey": "<redacted>",
      "deploymentId": "tool-embedding",
      "inputs": [
        {
          "name": "text",
          "source": "/document/pages/*"
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "vector"
        }
      ],
      "authIdentity": null
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "#2",
      "description": "Split skill to chunk documents",
      "context": "/document",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 2000,
      "pageOverlapLength": 500,
      "maximumPagesToTake": 0,
      "inputs": [
        {
          "name": "text",
          "source": "/document/content"
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    }
  ],
  "cognitiveServices": null,
  "knowledgeStore": null,
  "indexProjections": {
    "selectors": [
      {
        "targetIndexName": "vector-menonrag",
        "parentKeyFieldName": "parent_id",
        "sourceContext": "/document/pages/*",
        "mappings": [
          {
            "name": "chunk",
            "source": "/document/pages/*",
            "sourceContext": null,
            "inputs": []
          },
          {
            "name": "vector",
            "source": "/document/pages/*/vector",
            "sourceContext": null,
            "inputs": []
          },
          {
            "name": "title",
            "source": "/document/metadata_storage_name",
            "sourceContext": null,
            "inputs": []
          }
        ]
      }
    ],
    "parameters": {
      "projectionMode": "skipIndexingParentDocuments"
    }
  },
  "encryptionKey": null
}
python azure azure-cognitive-search azure-ai
1个回答
0
投票

我对此没有一个巧妙的解决方案。但我通过确保每个元数据列都映射到拆分器中的每个块来解决这个问题:

{
  "name": "prosjektnavn",
  "source": "/document/prosjektnavn",
  "sourceContext": null,
  "inputs": []
}

当然也将所有这些都包含在索引中。

© www.soinside.com 2019 - 2024. All rights reserved.