import logging, os, pickle, torch, time
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.embeddings.openai import OpenAIEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoConfig, Pipeline
from dotenv import load_dotenv
# skip for streamlit process
# Excerpt: embed `chunks` with a locally stored model and index them in FAISS.
# `chunks`, `device` and `start` are not defined in this excerpt; presumably
# they come from the skipped Streamlit/PDF-loading code — confirm against the
# full script.
path = "instructor_xl"  # handed to both from_pretrained() calls below
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModel.from_pretrained(path)
# Tokenize every chunk in a single batch (padding and truncation enabled,
# tensors requested in "pt" format).
token_texts = tokenizer(chunks, return_tensors="pt", padding=True, truncation=True)
model = model.to(device)
# NOTE(review): only the model is moved to `device`; `token_texts` is not.
# Confirm the inputs end up on the same device before the forward call.
embeddings = model(**token_texts)
# Elapsed time is measured from `start`, which is set outside this excerpt.
duration = time.time() - start
logging.info(f"time to embed the books on {device}: {duration}")
# NOTE(review): `embeddings` is whatever calling the model directly returns,
# not an embeddings wrapper object. The text accompanying this snippet reports
# that this call fails with a missing-attribute error for exactly that reason.
VectorStore = FAISS.from_texts(chunks, embeddings)
所以 path 是本地路径,使用 AutoTokenizer 和 AutoModel 时,我可以批量运行模型。但是,FAISS.from_texts 无法接受这里的 embeddings 参数,报错信息为:no attribute 'embed_documents'。
我应该从头开始自己构建 FAISS 索引,还是有其他库可以帮我解决这个问题?
由于您是通过 LangChain 的集成来使用 FAISS,它需要的是一个实现了 Embeddings 接口的包装类,而不是直接传入模型本身。您可以创建一个自定义的 CustomEmbeddings 类,并把您的模型封装在其中。
如果您想使用托管在 Hugging Face 上的模型,例如 sentence-transformers 提供的模型,可以直接使用 HuggingFaceEmbeddings 类。
from langchain_community.embeddings import HuggingFaceEmbeddings

# Build a LangChain embeddings wrapper around the named sentence-transformers
# model, then hand that wrapper (not a raw model output) to FAISS.
# `chunks` and `FAISS` are expected to be in scope from earlier in the file.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
VectorStore = FAISS.from_texts(texts=chunks, embedding=embeddings)