Is there a way to make the LLM in LangChain answer questions based only on the provided context (PDFs)?


I'm trying to build a chat-with-multiple-PDFs app using LangChain, but when I ask the bot a question about the uploaded PDFs, it answers from the LLM's pretrained knowledge. I want it to answer only from the provided context. Here is the code:

    import streamlit as st
    from dotenv import load_dotenv
    from PyPDF2 import PdfReader
    from langchain.text_splitter import CharacterTextSplitter
    from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
    from langchain.vectorstores import FAISS
    from langchain.chat_models import ChatOpenAI
    from langchain.memory import ConversationBufferMemory
    from langchain.chains import ConversationalRetrievalChain
    from HtmlTemplates import css1, user_template, bot_template
    from langchain.llms import HuggingFaceHub

    def get_pdf_text(pdf_docs):
        text = ""
        for pdf in pdf_docs:
            pdf_reader = PdfReader(pdf)
            for page in pdf_reader.pages:
                # extract_text() can return None for image-only pages
                text += page.extract_text() or ""
        return text

    def get_text_chunks(text):
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)
        return chunks

    def get_vectorstore(text_chunks):
        embeddings = OpenAIEmbeddings()
        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
        return vectorstore



    def get_conversation_chain(vectorstore):
        llm = HuggingFaceHub(
            repo_id="google/flan-t5-xxl",
            model_kwargs={"temperature": 0.5, "max_length": 512},
        )
        memory = ConversationBufferMemory(
            memory_key='chat_history', return_messages=True)
        conversation_chain = ConversationalRetrievalChain.from_llm(
            verbose=False,
            chain_type="stuff",
            llm=llm,
            retriever=vectorstore.as_retriever(),
            memory=memory)
        return conversation_chain



    def handle_userinput(user_question):
        response = st.session_state.conversation({'question': user_question})
        st.session_state.chat_history = response['chat_history']
        for i, message in enumerate(st.session_state.chat_history):
            if i % 2 == 0:
                st.write(user_template.replace(
                    "{{MSG}}", message.content), unsafe_allow_html=True)
            else:
                st.write(bot_template.replace(
                    "{{MSG}}", message.content), unsafe_allow_html=True)
    

    def main():
        load_dotenv()
        st.set_page_config(page_title="Educational chatbot",
                           page_icon=":books:")
        st.write(css1, unsafe_allow_html=True)
        if "text_chunks" not in st.session_state:
            st.session_state.text_chunks = None
        if "conversation" not in st.session_state:
            st.session_state.conversation = None
        if "chat_history" not in st.session_state:
            st.session_state.chat_history = None

        st.header("chatbot")
        user_question = st.text_input("Ask your question about your PDFs here:")
        if user_question:
            # guard against asking before the PDFs have been processed
            if st.session_state.conversation is not None:
                handle_userinput(user_question)
            else:
                st.write("Please process your PDFs first")
        else:
            st.write("Please enter a question")
        #with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
        # pdf_docs is a (possibly empty) list, so check truthiness, not != 0
        if st.button("Process") and pdf_docs:
            with st.spinner("Processing"):
                # get text from pdf
                raw_text = get_pdf_text(pdf_docs)

                # text chunks
                text_chunks = get_text_chunks(raw_text)

                # vector store
                vectorstore = get_vectorstore(text_chunks)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(vectorstore)

    if __name__ == '__main__':
        main()

Is this caused by the pretrained embeddings, or is the answer coming purely from the LLM's pretrained knowledge?
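
A quick way to check is to print what the retriever returns for a question. If the relevant chunks come back, the embeddings/retrieval side is working and the off-context answers come from the LLM itself. A minimal sketch, assuming the `vectorstore` built above (the query string is a placeholder):

    # Print the chunks the retriever would hand to the LLM. If these
    # contain the answer but the bot still ignores them, the problem is
    # the LLM/prompt, not the embeddings.
    docs = vectorstore.similarity_search("your question here", k=4)
    for i, doc in enumerate(docs):
        print(f"--- chunk {i} ---")
        print(doc.page_content[:300])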


python pdf nlp chatbot langchain
1 Answer

As far as I understand it, I've just modified your code as follows. Basically, what you want is to query the vector store and then generate the answer with `ConversationalRetrievalChain`:
    from langchain_openai.embeddings import OpenAIEmbeddings
    embedding = OpenAIEmbeddings()

    from langchain_community.vectorstores import FAISS
    vectorstore = FAISS.from_texts(
        ["harry potter's owl is in the castle."], embedding)

    from langchain_community.llms import HuggingFaceEndpoint
    llm = HuggingFaceEndpoint(
        repo_id="google/flan-t5-xxl", max_new_tokens=512, temperature=0.5)

    from langchain_core.prompts import PromptTemplate
    # Prompt used to rewrite the follow-up question + chat history
    # into a standalone question before retrieval
    template = (
        "Combine the chat history and follow up question into "
        "a standalone question. Chat History: {chat_history}"
        "Follow up question: {question}"
    )
    prompt = PromptTemplate.from_template(template)

    from langchain.memory import ConversationBufferMemory
    from langchain.chains import ConversationalRetrievalChain
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        memory=memory,
        condense_question_prompt=prompt,  # use the custom standalone-question prompt
        verbose=True,
    )

    query = input("Please input your query: ")
    result = conversation_chain.invoke({'question': query})
    print(result['answer'])
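
Note that the chain above can still fall back on the model's own knowledge, because the default QA prompt does not forbid it. One way to restrict answers to the retrieved context is to override the prompt of the underlying "stuff" chain via `combine_docs_chain_kwargs`; this is a sketch, and the prompt wording is just an illustration:

    from langchain_core.prompts import PromptTemplate

    # Illustrative prompt: tell the model to refuse when the answer is not
    # in the retrieved chunks. The "stuff" chain fills in {context} and
    # {question}.
    qa_prompt = PromptTemplate.from_template(
        "Answer the question using ONLY the context below. If the answer "
        "is not in the context, reply \"I don't know\".\n\n"
        "Context: {context}\n\nQuestion: {question}\nAnswer:"
    )

    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(),
        memory=memory,
        combine_docs_chain_kwargs={"prompt": qa_prompt},
    )

Small instruction-tuned models like flan-t5-xxl do not always follow such constraints reliably, so this reduces, but does not guarantee against, answers from pretrained knowledge.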