How can I use two 8GB GPUs with TheBloke/Llama-2-13B-chat-GPTQ?

Problem description

I am building a document chat application with TheBloke/Llama-2-13B-chat-GPTQ, but the model does not produce results on a single 8GB GPU; it appears to need about 16GB of GPU memory. I have two 8GB GPUs. How can I use both of them to run this model?

Reference code:

import os
import torch
from auto_gptq import AutoGPTQForCausalLM
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
from transformers import AutoTokenizer, TextStreamer, pipeline, AutoModel
from langchain.llms import CTransformers


DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"


story_images = convert_from_path("pdfs/doc.pdf", dpi=88)
story_images[0]



# Remove any previously persisted Chroma index before rebuilding it (notebook shell command)
!rm -rf "db"

loader = PyPDFDirectoryLoader("pdfs")
docs = loader.load()
len(docs)




# Set the path to the directory where you've stored the downloaded model
model_directory = "/home/raj/Documents/chat_with_doc/instructor-large"

# Load the tokenizer and model from the directory
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModel.from_pretrained(model_directory)

# Set the device for the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


embeddings = HuggingFaceInstructEmbeddings(
    model_name="/home/raj/Documents/chat_with_doc/instructor-large", model_kwargs={"device": DEVICE}
)


text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
len(texts)

%%time
db = Chroma.from_documents(texts, embeddings, persist_directory="db")



# """## Llama 2 13B"""

model_name_or_path = "/home/raj/Documents/chat_with_doc/Llama-2-13b-Chat-GPTQ"
model_basename = "model"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    revision="gptq-4bit-128g-actorder_True",
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    inject_fused_attention=False,
    device=DEVICE,
    quantize_config=None,
)

# !nvidia-smi

DEFAULT_SYSTEM_PROMPT = """
You are a helpful """.strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
[INST] <<SYS>>
{system_prompt}
<</SYS>>

{prompt} [/INST]
""".strip()

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)


llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})

SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."

template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)



result = qa_chain(
    "summarize the given document."
)
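
# The RetrievalQA chain returns a dict: the generated answer is under "result"
# and the retrieved chunks under "source_documents" (enabled above).
print(result["result"])
print(len(result["source_documents"]))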

Thanks in advance.

I want to run "Llama-2-13b-Chat-GPTQ" using both of my 8GB GPUs.

gpu artificial-intelligence chatbot gpt-4 llama
1 Answer

Sell the two 8GB GPUs and buy a single 16GB card.

As a former GPU designer, I suspect that matrix multiplication depends on memory bandwidth as well as on a single unified memory pool, and that it is not a task that can be divided across multiple GPUs without a huge performance penalty.

There may be models that are collections of many smaller matrices; such models can be partitioned to run in parallel, but a monolithic model does not scale well to multiple GPUs.
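
That said, if the goal is simply to fit (rather than to speed up) a model that exceeds one card's memory, the Hugging Face stack can place whole transformer layers on different GPUs, so each matrix multiplication still runs on a single device and only activations move between cards. Below is a minimal sketch, assuming the installed auto_gptq version supports accelerate's device_map/max_memory arguments in from_quantized; the 7GiB limits are placeholder values that would need tuning:

from auto_gptq import AutoGPTQForCausalLM

# Sketch of a sharded load: accelerate splits the layers between cuda:0 and
# cuda:1 so the 13B GPTQ weights fit, at the cost of sequential execution.
model = AutoGPTQForCausalLM.from_quantized(
    "/home/raj/Documents/chat_with_doc/Llama-2-13b-Chat-GPTQ",
    revision="gptq-4bit-128g-actorder_True",
    model_basename="model",
    use_safetensors=True,
    trust_remote_code=True,
    inject_fused_attention=False,       # fused attention is reported not to work with a sharded model
    device_map="auto",                  # let accelerate place layers across both GPUs
    max_memory={0: "7GiB", 1: "7GiB"},  # assumed per-card limits; leave some headroom
    quantize_config=None,
)

Even if this loads, the trade-off is the one described above: the two GPUs work one after the other rather than in parallel, so you gain memory capacity but not throughput.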
