我正在尝试填充 Azure 搜索索引中的 ParsedDate 字段,以便稍后创建评分配置文件来改进搜索结果(使用新鲜度)。
我根据文档定义了索引、技能组和索引器,但是当我运行索引器时,该字段始终为空。
这就是我如何定义我的技能组中的技能:
{
"@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
"name": "#0",
"description": "A custom skill that parses dates from file names",
"uri": "https://az-function.azurewebsites.net/api/custom-skill-date-extraction?code=ABC",
"httpMethod": "POST",
"timeout": "PT30S",
"batchSize": 1,
"context": "/document",
"inputs": [
{
"name": "fileName",
"source": "/document/metadata_storage_path"
}
],
"outputs": [
{
"name": "parsedDate",
"targetName": "parsedDate"
}
]
}
这就是我定义索引的方式:
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Create or update the Azure AI Search index for the requesting customer.

    The index name is derived from the customer's data source name. The index
    contains the chunk key, parent id, title, chunk text, an embedding vector
    and a ``parsedDate`` DateTimeOffset field, plus vector-search and
    semantic-search configuration.

    Args:
        req: Incoming HTTP request; its body is handed to ``readRequestBody``.

    Returns:
        A 200 response with ``"<index name> created"`` on success, or a 500
        response carrying the error text if index creation fails.

    Raises:
        KeyError: If a required endpoint/deployment environment variable is
            not set (these are read before the guarded section).
    """
    logging.info('Python HTTP trigger function processed a request.')

    # Environment Variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    endpoint_openai = os.environ["AZURE_OPENAI_ENDPOINT"]
    deployment_id = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID"]
    # Use .get() so that an unset admin key falls back to DefaultAzureCredential
    # instead of raising KeyError before the fallback can be evaluated.
    admin_key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
    credential_search = AzureKeyCredential(admin_key) if admin_key else DefaultAzureCredential()
    credential = DefaultAzureCredential()

    customer = readRequestBody(req)
    _, datasource_name = utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name(datasource_name)

    # Logic for creating a search index
    try:
        index_client = SearchIndexClient(endpoint=endpoint, credential=credential_search)
        fields = [
            SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
            # Populated by the indexer from the custom skill output; used for freshness scoring.
            SearchField(name="parsedDate", type=SearchFieldDataType.DateTimeOffset, sortable=True, filterable=True, facetable=True),
            SearchField(name="title", type=SearchFieldDataType.String),
            SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
            SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
            SearchField(name="vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
        ]
        vector_search = VectorSearch(
            algorithms=[
                HnswAlgorithmConfiguration(
                    name="myHnsw",
                    parameters=HnswParameters(
                        m=4,
                        ef_construction=400,
                        ef_search=500,
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
                ExhaustiveKnnAlgorithmConfiguration(
                    name="myExhaustiveKnn",
                    parameters=ExhaustiveKnnParameters(
                        metric=VectorSearchAlgorithmMetric.COSINE,
                    ),
                ),
            ],
            profiles=[
                VectorSearchProfile(
                    name="myHnswProfile",
                    algorithm_configuration_name="myHnsw",
                    vectorizer="myOpenAI",
                ),
                VectorSearchProfile(
                    name="myExhaustiveKnnProfile",
                    algorithm_configuration_name="myExhaustiveKnn",
                    vectorizer="myOpenAI",
                ),
            ],
            vectorizers=[
                AzureOpenAIVectorizer(
                    name="myOpenAI",
                    kind="azureOpenAI",
                    azure_open_ai_parameters=AzureOpenAIParameters(
                        resource_uri=endpoint_openai,
                        deployment_id=deployment_id,
                        # NOTE(review): this passes the *search* credential object
                        # here; api_key presumably expects the Azure OpenAI key as
                        # a string - confirm and supply the right value.
                        api_key=credential_search,
                    ),
                ),
            ],
        )
        semantic_search = SemanticSearch(
            configurations=[
                SemanticConfiguration(
                    name="my-semantic-config",
                    prioritized_fields=SemanticPrioritizedFields(
                        content_fields=[
                            SemanticField(field_name="chunk"),
                            SemanticField(field_name="title"),
                        ]
                    ),
                )
            ]
        )
        index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
        result = index_client.create_or_update_index(index)
        return func.HttpResponse(f"{result.name} created", status_code=200)
    except Exception as e:
        # Top-level boundary of the HTTP function: log and report the failure
        # to the caller as a 500 rather than letting the host surface it.
        error_message = f"Failed to create or update the index. Error: {str(e)}"
        logging.error(error_message)
        return func.HttpResponse(error_message, status_code=500)
最后,我如何配置索引器:
Python
def main(req: func.HttpRequest) -> func.HttpResponse:
    """Create or update the customer's indexer and start an indexer run.

    The indexer name, target index and skillset are all derived from the
    customer's data source name.

    Args:
        req: Incoming HTTP request; its body is handed to ``readRequestBody``.

    Returns:
        A 200 response once the indexer has been created and a run requested,
        or a 500 response carrying the error text on failure.

    Raises:
        KeyError: If the search endpoint or admin key environment variable is
            not set (these are read before the guarded section).
    """
    logging.info('Python HTTP trigger function processed a request.')

    # Environment Variables
    endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
    credential_search = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"])

    customer = readRequestBody(req)
    credential = DefaultAzureCredential()
    _, data_source_name = utils.getStorageAccountInfo(customer, credential)
    index_name = utils.get_index_name(data_source_name)
    skillset_name = utils.get_skillset_name(data_source_name)

    # Indexer creation logic
    try:
        indexer_name = f"{data_source_name}-indexer"
        indexer = SearchIndexer(
            name=indexer_name,
            description="Indexer to index documents and generate embeddings",
            skillset_name=skillset_name,
            target_index_name=index_name,
            data_source_name=data_source_name,
            # field_mappings map fields that come straight from the data source.
            field_mappings=[
                FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
            ],
            # parsedDate is produced by the custom skill, not by the data
            # source, so it must be mapped from the enrichment tree path via
            # output_field_mappings; listing it under field_mappings leaves
            # the index field null.
            output_field_mappings=[
                FieldMapping(source_field_name="/document/parsedDate", target_field_name="parsedDate"),
            ],
            parameters=IndexingParameters(
                configuration={
                    "dataToExtract": "contentAndMetadata",
                    "imageAction": "generateNormalizedImages"
                }
            )
        )
        indexer_client = SearchIndexerClient(endpoint, credential_search)
        indexer_client.create_or_update_indexer(indexer)

        # Run the indexer
        indexer_client.run_indexer(indexer_name)
        message = f'{indexer_name} is created and running. If queries return no results, please wait a bit and try again.'
        logging.info(message)
        return func.HttpResponse(message, status_code=200)
    except Exception as e:
        # Top-level boundary of the HTTP function: log and return a 500.
        error_message = f"Failed to create or run the indexer. Error: {str(e)}"
        logging.error(error_message)
        return func.HttpResponse(error_message, status_code=500)
我也尝试在索引器中使用 out_field_mappings 但无济于事。
任何建议都会很棒。
Azure 人工智能搜索
您会得到 null 值,通常是由于以下原因:
outputFieldMappings 配置不正确,例如技能组输出的 targetName 与索引器定义中 outputFieldMappings 的 sourceFieldName 不匹配。
示例:如果技能组输出的 name 为 parsedDate,而您的函数返回如下例所示的数据(注意键名是 parsedate),您将得到一个空值。返回数据中的字段名必须与技能组中的输出名称完全一致。
{
"values": [
{
"recordId": "0",
"data": {
"parsedate": "2015-01-01T00:00:00.000Z"
}
}
]
}
接下来,在返回值时在 Azure 函数中使用以下代码:
req_body = req.get_json()
values = req_body.get('values')
res = []
for i in values:
tmp = i
tmp['data'] = {'parsedDte': "parsed_date_from_path"} # example: 2015-01-01T00:00:00.000Z
res.append(tmp)
if res:
return func.HttpResponse(json.dumps({"values": res}), mimetype="application/json")
配置
outputFieldMappings
:
"outputs": [
{
"name": "parsedDate",
"targetName": "parsedDate"
}]
对于技能组中的上述输出,您需要提供
outputFieldMappings
,如下所示:
"outputFieldMappings": [
{
"sourceFieldName": "/document/parsedDate", # output of skillset
"targetFieldName": "parsedDate" # Target index field name
}
]
或者用代码:
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
indexer_name = "vs-code-2-indexer"
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name="skillset1712921532571",
    target_index_name="vs-code-2",
    data_source_name="hotels-sample",
    # Map the skill output (enrichment tree path) to the target index field.
    output_field_mappings=[
        FieldMapping(source_field_name="/document/parsedDate", target_field_name="parsedDate"),
    ],
)
如果上述解决方案不起作用,请删除当前索引并创建具有相同定义的新索引。然后使用上述 outputFieldMappings 重置并运行索引器。