We are having trouble converting the RAG approach into Lambda code. We are following this Jupyter notebook: https://github.com/pinecone-io/examples/blob/master/learn/generation/aws/sagemaker/sagemaker-llama-2-rag.ipynb
Here are some of the code snippets from that notebook that we are struggling to convert to Lambda:
from typing import List

import numpy as np

def embed_docs(docs: List[str]) -> List[List[float]]:
    # `encoder` is the SageMaker predictor for the sentence-embedding endpoint
    out = encoder.predict({"inputs": docs})
    # mean-pool the token-level embeddings into one vector per document
    embeddings = np.mean(np.array(out), axis=1)
    return embeddings.tolist()
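One change we had to make straight away: `encoder` is a SageMaker SDK predictor object that only exists in the notebook session. Inside Lambda we call the embedding endpoint through the `sagemaker-runtime` boto3 client instead. A minimal sketch, assuming the endpoint name is supplied via an environment variable of our own (`EMBED_ENDPOINT_NAME`) and numpy is available through a Lambda layer:

import json
import os
from typing import List

import boto3
import numpy as np

sm_runtime = boto3.client("sagemaker-runtime")

def embed_docs(docs: List[str]) -> List[List[float]]:
    # invoke the sentence-embedding endpoint directly instead of encoder.predict
    response = sm_runtime.invoke_endpoint(
        EndpointName=os.environ["EMBED_ENDPOINT_NAME"],  # our own env var, not from the notebook
        ContentType="application/json",
        Body=json.dumps({"inputs": docs}),
    )
    out = json.loads(response["Body"].read())
    # same mean pooling as the notebook version
    embeddings = np.mean(np.array(out), axis=1)
    return embeddings.tolist()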
s3_path = "s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv"

# Downloading the database (notebook shell magic; not available inside Lambda)
!aws s3 cp $s3_path Amazon_SageMaker_FAQs.csv
import pandas as pd

df_knowledge = pd.read_csv("Amazon_SageMaker_FAQs.csv", header=None, names=["Question", "Answer"])
df_knowledge.head()

# keep only the answers; the questions themselves are not needed for retrieval
df_knowledge.drop(["Question"], axis=1, inplace=True)
df_knowledge.head()
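Since the `!aws s3 cp` shell magic above only works in the notebook, in the Lambda we fetch the CSV with boto3 instead, writing to `/tmp` (the only writable path in a Lambda). Roughly like this, with bucket and key split out of `s3_path` above:

import boto3
import pandas as pd

s3 = boto3.client("s3")
s3.download_file(
    "jumpstart-cache-prod-us-east-2",
    "training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv",
    "/tmp/Amazon_SageMaker_FAQs.csv",
)
df_knowledge = pd.read_csv(
    "/tmp/Amazon_SageMaker_FAQs.csv", header=None, names=["Question", "Answer"]
)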
import os
from pinecone import Pinecone
# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'
# configure client
pc = Pinecone(api_key=api_key)
pc.list_indexes().names()
from pinecone import ServerlessSpec
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name = "llama-2-7b-example"

import time

# check if index already exists (it shouldn't if this is the first time)
if index_name not in pc.list_indexes().names():
    # if it does not exist, create the index
    pc.create_index(
        index_name,
        dimension=embeddings.shape[1],
        metric='cosine',
        spec=spec
    )
    # wait for the index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()
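One porting snag here: `embeddings.shape[1]` refers to an array computed in an earlier notebook cell, which does not exist in the Lambda. A self-contained alternative (a sketch, using the boto3 `embed_docs` above) is to probe the embedding endpoint once:

# derive the index dimension from the embedding endpoint itself,
# since the notebook's `embeddings` array does not exist in the Lambda
dimension = len(embed_docs(["dimension probe"])[0])

if index_name not in pc.list_indexes().names():
    pc.create_index(index_name, dimension=dimension, metric='cosine', spec=spec)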
from tqdm.auto import tqdm

batch_size = 2  # can increase, but needs a larger instance size or the instance runs out of memory
vector_limit = 1000

answers = df_knowledge[:vector_limit]
index = pc.Index(index_name)

for i in tqdm(range(0, len(answers), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(answers))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{"text": text} for text in answers["Answer"][i:i_end]]
    # create embeddings
    texts = answers["Answer"][i:i_end].tolist()
    embeddings = embed_docs(texts)
    # create records list for upsert
    records = zip(ids, embeddings, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

# check number of records in the index
index.describe_index_stats()
# example query (the question variable is defined in an earlier notebook cell)
question = "Does SageMaker support spot instances?"

# extract the embedding for the question
query_vec = embed_docs(question)[0]
# query pinecone
res = index.query(vector=query_vec, top_k=1, include_metadata=True)
# show the results
res

contexts = [match.metadata["text"] for match in res.matches]
max_section_len = 1000
separator = "\n"

def construct_context(contexts: List[str]) -> str:
    chosen_sections = []
    chosen_sections_len = 0

    for text in contexts:
        text = text.strip()
        # Add contexts until we run out of space.
        chosen_sections_len += len(text) + 2
        if chosen_sections_len > max_section_len:
            break
        chosen_sections.append(text)
    concatenated_doc = separator.join(chosen_sections)
    print(
        f"With maximum sequence length {max_section_len}, selected top {len(chosen_sections)} document sections: \n{concatenated_doc}"
    )
    return concatenated_doc
context_str = construct_context(contexts=contexts)
def create_payload(question, context_str) -> dict:
    prompt_template = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".
CONTEXT:
{context}
ANSWER:
"""
    text_input = prompt_template.replace("{context}", context_str).replace("{question}", question)
    payload = {
        "inputs": [
            [
                {"role": "system", "content": text_input},
                {"role": "user", "content": question},
            ]
        ],
        "parameters": {"max_new_tokens": 256, "top_p": 0.9, "temperature": 0.6, "return_full_text": False},
    }
    return payload
payload = create_payload(question, context_str)
# `predictor` is the SageMaker predictor for the Llama-2 JumpStart endpoint
out = predictor.predict(payload, custom_attributes='accept_eula=true')
generated_text = out[0]['generation']['content']
print(f"[Input]: {question}\n[Output]: {generated_text}")
def rag_query(question: str) -> str:
    # create query vec
    query_vec = embed_docs(question)[0]
    # query pinecone
    res = index.query(vector=query_vec, top_k=5, include_metadata=True)
    # get contexts
    contexts = [match.metadata["text"] for match in res.matches]
    # build the multiple contexts string
    context_str = construct_context(contexts=contexts)
    # create our retrieval augmented prompt
    payload = create_payload(question, context_str)
    # make prediction
    out = predictor.predict(payload, custom_attributes='accept_eula=true')
    return out[0]["generation"]["content"]
rag_query("Does SageMaker support spot instances?")
First, we deployed the Llama-2 JumpStart model and the Hugging Face sentence-embedding model using this SageMaker-based notebook. Now we are trying to do the same in AWS Lambda: we import the CSV file from an S3 bucket, supply our Pinecone credentials, and copy the functions from the Jupyter notebook into the Lambda. After copying the code over, we are stuck on how to write the Lambda handler function around these snippets.
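For reference, here is roughly the handler shape we have been trying to write. This is a sketch under the assumptions above: the boto3 versions of `embed_docs`, `construct_context`, and `create_payload` are defined at module level, the Pinecone index is already populated, and the incoming event carries a `question` field.

import json
import os

import boto3
from pinecone import Pinecone

# clients at module scope so warm invocations reuse them
sm_runtime = boto3.client("sagemaker-runtime")
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("llama-2-7b-example")

def lambda_handler(event, context):
    # assumes the caller sends {"question": "..."} (directly or as a JSON body)
    body = json.loads(event["body"]) if "body" in event else event
    question = body["question"]

    # retrieve: embed the question and pull the top matches from Pinecone
    query_vec = embed_docs([question])[0]
    res = index.query(vector=query_vec, top_k=5, include_metadata=True)
    contexts = [match.metadata["text"] for match in res.matches]
    context_str = construct_context(contexts=contexts)

    # generate: send the retrieval-augmented prompt to the Llama-2 endpoint
    payload = create_payload(question, context_str)
    response = sm_runtime.invoke_endpoint(
        EndpointName=os.environ["LLAMA_ENDPOINT_NAME"],  # our own env var
        ContentType="application/json",
        CustomAttributes="accept_eula=true",
        Body=json.dumps(payload),
    )
    out = json.loads(response["Body"].read())
    answer = out[0]["generation"]["content"]

    return {"statusCode": 200, "body": json.dumps({"answer": answer})}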
What we expect is for Llama to respond to the query we provide, with the query going through the RAG pipeline so that the answer is grounded in the documents we loaded.
Thanks for any help; it is much appreciated.
Could you add more specifics about the errors you are running into (e.g., error messages, API failures, stack traces)?