Vespa 在查询期间无法识别嵌入 ID。即使它位于 vald embedders 列表中

问题描述 投票:0回答:1
Error while searching:- [{'code': 3, 'summary': 'Illegal query', 'message': "Could not set 'ranking.features.query(query_embedding)' to 'embed(e5, What is the Goodwill as of December 2023 in SUBSIDIARIES CONSOLIDATED BALANCE SHEETS?)': Multiple embedders are provided but no embedder id is given. Valid embedders are colbert,e5"}]

问题仅出现在检索时,如果查询与嵌入有关,则无法嵌入查询字符串。

但是我尝试了一个简单的查询来查看数据中是否确实存在嵌入,确实有。

不知道为什么它能够在索引文档时识别它,但不能在查询中识别它。

映射:-

self.app_package = ApplicationPackage(name=self.app_name)
# self.app_package.schema.mode = "streaming"
self.meta_variables = ['doc_id','document_name', 'type', 'reportedTime', 'period', 'IsNro', 'pageNumber', 'language', 'company_ID', 'company_name', 'company_ticker', 'company_countryCode', 'company_quantum', 'company_currency', 'company_fiscalYear', 'company_fyAdjustment']
self.app_package.schema.add_fields(
    Field(
        name="text", type="string", indexing=["index", "summary"], index="enable-bm25"
    ),
    Field(
        name="embedding",
        type="tensor<float>(x[1024])",
        indexing=["input text", "embed e5","attribute", "summary", "index"],
        attribute=["distance-metric: angular"],
        is_document_field=False
    ),
    Field(
        name="colbert",
        type="tensor<float>(dt{}, x[128])",
        indexing=["input text", "embed colbert","attribute", "summary", "index"],
        attribute=["distance-metric: angular"],
        is_document_field=False
    ),
    Field(name="doc_id", type="int", indexing=["attribute", "summary"]),
    Field(name="document_name", type="string", indexing=["attribute", "summary"], match=['word']),
    Field(name="type", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="reportedTime", type="string", indexing=["attribute", "summary"], match=['word']),
    Field(name="period", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="IsNro", type="bool", indexing=["attribute", "summary"]),
    Field(name="pageNumber", type="int", indexing=["attribute", "summary"]),
    Field(name="language", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="company_ID", type="int", indexing=["attribute", "summary"]),
    Field(name="company_name", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="company_ticker", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="company_countryCode", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="company_quantum", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="company_currency", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="company_fiscalYear", type="string", indexing=["attribute", "summary"], match=['exact']),
    Field(name="company_fyAdjustment", type="bool", indexing=["attribute", "summary"], match=['exact']),
)
self.app_package.schema.add_rank_profile(
    RankProfile(
        name="default",
        first_phase="closeness(field, embedding)",
        inputs=[("query(query_embedding)", "tensor<float>(x[1024])")],
    )
)
self.app_package.schema.add_rank_profile(
    RankProfile(
        name="combined_ranking",
        first_phase="cos_sim",
        second_phase=SecondPhaseRanking(expression="0.05 * bm25(text) + 0.15 * cos_sim + 0.8 * max_sim", rerank_count=10),
        # global_phase=GlobalPhaseRanking(expression="0.05 * bm25(text) + 0.25 * cos_sim + 0.7 * max_sim"),
        functions=[Function(name="unpack", expression="cell_cast(attribute(colbert), float)"),Function(name="cos_sim", expression="cosine_similarity(query(query_embedding), attribute(embedding),x)"),Function(
    name="max_sim",
    expression="""sum(
reduce(
sum(
    query(qt) * attribute(colbert) , x
),
max, dt
),
qt
)/32.0
"""
)],
        inputs=[
            ("query(query_embedding)", "tensor<float>(x[1024])"),
            ("query(qt)", "tensor<float>(qt{}, x[128])")
        ],
        match_features=["max_sim", "cos_sim", "bm25(text)"]
    )
)
self.app_package.components = [Component(id="colbert", type="colbert-embedder",
    parameters=[
        Parameter("transformer-model", {"url": "https://huggingface.co/mixedbread-ai/mxbai-colbert-large-v1/resolve/main/onnx/model.onnx?download=true"}),
        Parameter("tokenizer-model", {"url": "https://huggingface.co/mixedbread-ai/mxbai-colbert-Large-v1/raw/main/tokenizer.json"})
    ]
),
    Component(id="e5", type="hugging-face-embedder",
    parameters=[
        Parameter("transformer-model", {"url": "https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/onnx/model.onnx?download=true"}),
        Parameter("tokenizer-model", {"url": "https://huggingface.co/BAAI/bge-large-en-v1.5/resolve/main/tokenizer.json"})
    ]
)]

出现此错误的查询:-

def get_top_para_finance(self, query: str, doc_id: int):
    # print(self.vespa_app.application_package.get_model(model_id='colbert'))
    with self.vespa_app.syncio(connections=12) as session:
        start = time.time()
        print(f"Got the Query:- {query}")
        st = time.time()
        # embeddings = self.vespa_obj.embedding_function.embed_query(query)
        print(f"Time to get the Embeddings:- {round(time.time()-st, 2)}s")
        result =  self.vespa_app.query(
            yql="select * from sources * where {targetHits: 10}nearestNeighbor(embedding, query_embedding) and doc_id = "+ f"{doc_id}",
            query=query,
            ranking= "default",
            body={
                "input.query(qt)": f"embed(colbert, {query})",
                "input.query(query_embedding)": f"embed(e5, {query})",
            },
            hits = 1,
            # timeout = "1ms"
        )
        assert(result.is_successfull())
        end = time.time()
        total_time = round(end-start, 2)
        print(f"Search time:- {total_time}s")
        return self.display_hits_as_df(result, self.vespa_obj.meta_variables+['text']), total_time
python database full-text-search vespa vector-database
1个回答
0
投票

您需要引用您要嵌入的文本:

body={
                "input.query(qt)": f"embed(colbert, \"{query}\")",
                "input.query(query_embedding)": f"embed(e5, \"{query}\")",
            },

这里的错误消息肯定会更清楚 - 我会解决这个问题。

© www.soinside.com 2019 - 2024. All rights reserved.