Azure AI 搜索文本拆分技能调试会话错误

问题描述 投票:0回答:1

我正在尝试在我的 AI 丰富管道中实现文本拆分(Text Split)认知技能。

我的技能组定义如下:

{
  "@odata.context": "https://<redacted>/$metadata#skillsets/$entity",
  "@odata.etag": "\"<redacted>\"",
  "name": "documentindexingskillset",
  "description": "",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Util.DocumentExtractionSkill",
      "name": "doc_extract_skill",
      "description": "",
      "context": "/document",
      "parsingMode": "default",
      "dataToExtract": "contentAndMetadata",
      "inputs": [
        {
          "name": "file_data",
          "source": "/document/file_data"
        }
      ],
      "outputs": [
        {
          "name": "content",
          "targetName": "extracted_content"
        },
        {
          "name": "normalized_images",
          "targetName": "extracted_normalized_images"
        }
      ],
      "configuration": {
        "imageAction": "generateNormalizedImages",
        "normalizedImageMaxWidth@odata.type": "#Int64",
        "normalizedImageMaxWidth": 2000,
        "normalizedImageMaxHeight@odata.type": "#Int64",
        "normalizedImageMaxHeight": 2000
      }
    },
    {
      "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
      "name": "doc_ocr_skill",
      "description": "Extracts text (plain and structured) from image.",
      "context": "/document/extracted_normalized_images/*",
      "textExtractionAlgorithm": null,
      "lineEnding": "Space",
      "defaultLanguageCode": "en",
      "detectOrientation": true,
      "inputs": [
        {
          "name": "image",
          "source": "/document/extracted_normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "text",
          "targetName": "imageText"
        },
        {
          "name": "layoutText",
          "targetName": "imageLayoutText"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "name": "doc_merge_skill",
      "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field.",
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/extracted_content"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/extracted_normalized_images/*/imageText"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "mergedText"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.LanguageDetectionSkill",
      "name": "doc_language_detection",
      "description": "",
      "context": "/document",
      "defaultCountryHint": "",
      "modelVersion": null,
      "inputs": [
        {
          "name": "text",
          "source": "/document/mergedText"
        }
      ],
      "outputs": [
        {
          "name": "languageCode",
          "targetName": "languageCode"
        },
        {
          "name": "languageName",
          "targetName": "languageName"
        },
        {
          "name": "score",
          "targetName": "score"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "doc_text_split",
      "description": "",
      "context": "/document",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 1000,
      "pageOverlapLength": 100,
      "maximumPagesToTake": 0,
      "inputs": [
        {
          "name": "text",
          "source": "/document/mergedText"
        },
        {
          "name": "languageCode",
          "source": "/document/languageCode"
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "mypages"
        }
      ]
    }
  ],
  "cognitiveServices": {
    "@odata.type": "#Microsoft.Azure.Search.DefaultCognitiveServices",
    "description": null
  },
  "knowledgeStore": null,
  "indexProjections": null,
  "encryptionKey": null
}

我的索引器定义如下:

{
  "@odata.context": "<redacted>/$metadata#indexers/$entity",
  "@odata.etag": "\"<redacted>\"",
  "name": "cs-ai-uks-01-ixr-02",
  "description": null,
  "dataSourceName": "ds-cs-ai-uks-02-saaiuksstg01",
  "skillsetName": "documentindexingskillset",
  "targetIndexName": "cs-ai-uks-02-is-02",
  "disabled": null,
  "schedule": null,
  "parameters": {
    "batchSize": null,
    "maxFailedItems": null,
    "maxFailedItemsPerBatch": null,
    "base64EncodeKeys": null,
    "configuration": {
      "imageAction": "generateNormalizedImages",
      "allowSkillsetToReadFileData": true
    }
  },
  "fieldMappings": [],
  "outputFieldMappings": [
    {
      "sourceFieldName": "/document/mypages",
      "targetFieldName": "Content"
    }
  ],
  "cache": null,
  "encryptionKey": null
}

最后,我的索引定义如下:

{
  "@odata.context": "<redacted>/$metadata#indexes/$entity",
  "@odata.etag": "\"<redacted>\"",
  "name": "cs-ai-uks-02-is-02",
  "defaultScoringProfile": null,
  "fields": [
    {
      "name": "id",
      "type": "Edm.String",
      "searchable": false,
      "filterable": false,
      "retrievable": true,
      "sortable": false,
      "facetable": false,
      "key": true,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": null,
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    },
    {
      "name": "Content",
      "type": "Edm.String",
      "searchable": true,
      "filterable": false,
      "retrievable": true,
      "sortable": false,
      "facetable": false,
      "key": false,
      "indexAnalyzer": null,
      "searchAnalyzer": null,
      "analyzer": "standard.lucene",
      "normalizer": null,
      "dimensions": null,
      "vectorSearchProfile": null,
      "synonymMaps": []
    }
  ],
  "scoringProfiles": [],
  "corsOptions": null,
  "suggesters": [],
  "analyzers": [],
  "normalizers": [],
  "tokenizers": [],
  "tokenFilters": [],
  "charFilters": [],
  "encryptionKey": null,
  "similarity": {
    "@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
    "k1": null,
    "b": null
  },
  "semantic": null,
  "vectorSearch": null
}

我的 AI 搜索服务当前配置为基本定价层 (SKU:B)。

我遇到两个问题:

  1. 当我运行索引器时,它成功完成,存储容器中的所有文档都被索引,但文本没有被拆分。“maximumPageLength”和“pageOverlapLength”参数分别设置为 1000 和 100,但已索引文档的“Content”字段中包含的文本远多于此。根据这篇文章,我预计 Content 字段大约包含 1000 个字符。
  2. 当我尝试配置调试会话时,保存调试会话时出现以下错误:

无法创建调试会话“new-debug-session”,错误:“InvalidSkillset:一项或多项技能无效。详细信息:在 Skill 上发现意外属性。参数:Debug.Skillset”

当我从技能集中删除文本拆分技能时,我可以毫无问题地保存并运行调试会话。

我不明白“在 Skill 上发现意外属性。参数:Debug.Skillset”这条消息,因为我并没有定义过这样的参数。

如果有任何意见,我将不胜感激,并提前致谢。

问候。

azure search text split artificial-intelligence
1个回答
0
投票

即使在不使用文本拆分技能的索引器上运行调试会话,我也会遇到相同的错误。

© www.soinside.com 2019 - 2024. All rights reserved.