We are having trouble converting the RAG approach into Lambda code. We are following this Jupyter notebook: https://github.com/pinecone-io/examples/blob/master/learn/generation/aws/sagemaker/sagemaker-llama-2-rag.ipynb
Here are some of the code snippets from that notebook that we are struggling to convert to Lambda:
from typing import List

import numpy as np

def embed_docs(docs: List[str]) -> List[List[float]]:
    # `encoder` is the SageMaker predictor for the sentence-embedding endpoint
    out = encoder.predict({"inputs": docs})
    # mean-pool the token-level embeddings into one vector per document
    embeddings = np.mean(np.array(out), axis=1)
    return embeddings.tolist()
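One change we had to make straight away: `encoder` is a SageMaker SDK predictor object that only exists in the notebook session. Inside Lambda we call the embedding endpoint through the `sagemaker-runtime` boto3 client instead. A minimal sketch, assuming the endpoint name is supplied via an environment variable of our own (`EMBED_ENDPOINT_NAME`) and numpy is available through a Lambda layer:

import json
import os
from typing import List

import boto3
import numpy as np

sm_runtime = boto3.client("sagemaker-runtime")

def embed_docs(docs: List[str]) -> List[List[float]]:
    # invoke the sentence-embedding endpoint directly instead of encoder.predict
    response = sm_runtime.invoke_endpoint(
        EndpointName=os.environ["EMBED_ENDPOINT_NAME"],  # our own env var, not from the notebook
        ContentType="application/json",
        Body=json.dumps({"inputs": docs}),
    )
    out = json.loads(response["Body"].read())
    # same mean pooling as the notebook version
    embeddings = np.mean(np.array(out), axis=1)
    return embeddings.tolist()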
s3_path = "s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv"

# Downloading the database (notebook shell magic; not available inside Lambda)
!aws s3 cp $s3_path Amazon_SageMaker_FAQs.csv
import pandas as pd

df_knowledge = pd.read_csv("Amazon_SageMaker_FAQs.csv", header=None, names=["Question", "Answer"])
df_knowledge.head()

# keep only the answers; the questions themselves are not needed for retrieval
df_knowledge.drop(["Question"], axis=1, inplace=True)
df_knowledge.head()
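Since the `!aws s3 cp` shell magic above only works in the notebook, in the Lambda we fetch the CSV with boto3 instead, writing to `/tmp` (the only writable path in a Lambda). Roughly like this, with bucket and key split out of `s3_path` above:

import boto3
import pandas as pd

s3 = boto3.client("s3")
s3.download_file(
    "jumpstart-cache-prod-us-east-2",
    "training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv",
    "/tmp/Amazon_SageMaker_FAQs.csv",
)
df_knowledge = pd.read_csv(
    "/tmp/Amazon_SageMaker_FAQs.csv", header=None, names=["Question", "Answer"]
)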
import os
from pinecone import Pinecone
# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'
# configure client
pc = Pinecone(api_key=api_key)
pc.list_indexes().names()
from pinecone import ServerlessSpec
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name = "llama-2-7b-example"

import time

# check if index already exists (it shouldn't if this is the first time)
if index_name not in pc.list_indexes().names():
    # if it does not exist, create the index
    pc.create_index(
        index_name,
        dimension=embeddings.shape[1],
        metric='cosine',
        spec=spec
    )
    # wait for the index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()
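One porting snag here: `embeddings.shape[1]` refers to an array computed in an earlier notebook cell, which does not exist in the Lambda. A self-contained alternative (a sketch, using the boto3 `embed_docs` above) is to probe the embedding endpoint once:

# derive the index dimension from the embedding endpoint itself,
# since the notebook's `embeddings` array does not exist in the Lambda
dimension = len(embed_docs(["dimension probe"])[0])

if index_name not in pc.list_indexes().names():
    pc.create_index(index_name, dimension=dimension, metric='cosine', spec=spec)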
from tqdm.auto import tqdm

batch_size = 2  # can increase, but needs a larger instance size or the instance runs out of memory
vector_limit = 1000

answers = df_knowledge[:vector_limit]
index = pc.Index(index_name)

for i in tqdm(range(0, len(answers), batch_size)):
    # find end of batch
    i_end = min(i + batch_size, len(answers))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{"text": text} for text in answers["Answer"][i:i_end]]
    # create embeddings
    texts = answers["Answer"][i:i_end].tolist()
    embeddings = embed_docs(texts)
    # create records list for upsert
    records = zip(ids, embeddings, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

# check number of records in the index
index.describe_index_stats()
# example query (the question variable is defined in an earlier notebook cell)
question = "Does SageMaker support spot instances?"

# extract the embedding for the question
query_vec = embed_docs(question)[0]
# query pinecone
res = index.query(vector=query_vec, top_k=1, include_metadata=True)
# show the results
res

contexts = [match.metadata["text"] for match in res.matches]
max_section_len = 1000
separator = "\n"

def construct_context(contexts: List[str]) -> str:
    chosen_sections = []
    chosen_sections_len = 0

    for text in contexts:
        text = text.strip()
        # Add contexts until we run out of space.
        chosen_sections_len += len(text) + 2
        if chosen_sections_len > max_section_len:
            break
        chosen_sections.append(text)
    concatenated_doc = separator.join(chosen_sections)
    print(
        f"With maximum sequence length {max_section_len}, selected top {len(chosen_sections)} document sections: \n{concatenated_doc}"
    )
    return concatenated_doc
context_str = construct_context(contexts=contexts)
def create_payload(question, context_str) -> dict:
    prompt_template = """Answer the following QUESTION based on the CONTEXT
given. If you do not know the answer and the CONTEXT doesn't
contain the answer truthfully say "I don't know".
CONTEXT:
{context}
ANSWER:
"""
    text_input = prompt_template.replace("{context}", context_str).replace("{question}", question)
    payload = {
        "inputs": [
            [
                {"role": "system", "content": text_input},
                {"role": "user", "content": question},
            ]
        ],
        "parameters": {"max_new_tokens": 256, "top_p": 0.9, "temperature": 0.6, "return_full_text": False},
    }
    return payload
payload = create_payload(question, context_str)
# `predictor` is the SageMaker predictor for the Llama-2 JumpStart endpoint
out = predictor.predict(payload, custom_attributes='accept_eula=true')
generated_text = out[0]['generation']['content']
print(f"[Input]: {question}\n[Output]: {generated_text}")
def rag_query(question: str) -> str:
    # create query vec
    query_vec = embed_docs(question)[0]
    # query pinecone
    res = index.query(vector=query_vec, top_k=5, include_metadata=True)
    # get contexts
    contexts = [match.metadata["text"] for match in res.matches]
    # build the multiple contexts string
    context_str = construct_context(contexts=contexts)
    # create our retrieval augmented prompt
    payload = create_payload(question, context_str)
    # make prediction
    out = predictor.predict(payload, custom_attributes='accept_eula=true')
    return out[0]["generation"]["content"]
rag_query("Does SageMaker support spot instances?")
First, we deployed the Llama-2 JumpStart model and the Hugging Face sentence-embedding model using this SageMaker-based notebook. Now we are trying to do the same in AWS Lambda: we import the CSV file from an S3 bucket, supply our Pinecone credentials, and copy the functions from the Jupyter notebook into the Lambda. After copying the code over, we are stuck on how to write the Lambda handler function around these snippets.
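For reference, here is roughly the handler shape we have been trying to write. This is a sketch under the assumptions above: the boto3 versions of `embed_docs`, `construct_context`, and `create_payload` are defined at module level, the Pinecone index is already populated, and the incoming event carries a `question` field.

import json
import os

import boto3
from pinecone import Pinecone

# clients at module scope so warm invocations reuse them
sm_runtime = boto3.client("sagemaker-runtime")
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index("llama-2-7b-example")

def lambda_handler(event, context):
    # assumes the caller sends {"question": "..."} (directly or as a JSON body)
    body = json.loads(event["body"]) if "body" in event else event
    question = body["question"]

    # retrieve: embed the question and pull the top matches from Pinecone
    query_vec = embed_docs([question])[0]
    res = index.query(vector=query_vec, top_k=5, include_metadata=True)
    contexts = [match.metadata["text"] for match in res.matches]
    context_str = construct_context(contexts=contexts)

    # generate: send the retrieval-augmented prompt to the Llama-2 endpoint
    payload = create_payload(question, context_str)
    response = sm_runtime.invoke_endpoint(
        EndpointName=os.environ["LLAMA_ENDPOINT_NAME"],  # our own env var
        ContentType="application/json",
        CustomAttributes="accept_eula=true",
        Body=json.dumps(payload),
    )
    out = json.loads(response["Body"].read())
    answer = out[0]["generation"]["content"]

    return {"statusCode": 200, "body": json.dumps({"answer": answer})}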
What we expect is for Llama to respond to the query we provide, with the query going through the RAG pipeline so that the answer is grounded in the documents we loaded.
Thanks for any help; it is much appreciated.
Could you add more specifics about the errors you are running into (e.g., error messages, API failures, stack traces)?