Error when converting the google flan-T5 model to ONNX


I want to convert a flan-T5 model downloaded from Hugging Face to ONNX format and then run inference with the ONNX model.

My input data is disease symptoms, and the expected output is the disease name.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import onnx

# Set the device to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl").to(device)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")

# Export the model to ONNX format
onnx_path = "flan-t5-xl.onnx"
dummy_input = tokenizer("What's the disease name in this text: Example text", return_tensors="pt", padding=True).to(device)
dummy_input_ids = dummy_input["input_ids"]
dummy_attention_mask = dummy_input["attention_mask"]
dummy_decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids.to(device)

with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask, dummy_decoder_input_ids),
        onnx_path,
        opset_version=11,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size"},
            "attention_mask": {0: "batch_size"},
            "decoder_input_ids": {0: "batch_size"},
            "output": {0: "batch_size", 1: "sequence_length"},
        },
    )
print(f"Model saved to {onnx_path}")

# Inference using the ONNX model on GPU

import onnxruntime

onnx_model = onnxruntime.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])

Creating the session fails with this error:

InvalidGraph: [ONNXRuntimeError] : 10 : INVALID_GRAPH : Load model from flan-t5-xl.onnx failed: This is an invalid model. Type Error: Type 'tensor(int64)' of input parameter (/decoder/block.0/layer.0/SelfAttention/Sub_output_0) of operator (Min) in node (/decoder/block.0/layer.0/SelfAttention/Min) is invalid.

The rest of the inference code, which never gets to run:

input_text = input("Enter Disease/Symptom Detail: ")
inputs = tokenizer(input_text, return_tensors="pt", padding=True).to(device)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids.to(device)

onnx_inputs = {
    "input_ids": input_ids.cpu().numpy(),
    "attention_mask": attention_mask.cpu().numpy(),
    "decoder_input_ids": decoder_input_ids.cpu().numpy(),
}

onnx_output = onnx_model.run(None, onnx_inputs)[0]
# The exported "output" is a logits tensor over the vocabulary;
# take the argmax to get token ids before decoding
predicted_ids = onnx_output.argmax(axis=-1)
decoded_output = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

print('-' * 100)
print(f"Name of Disease based on Entered Text: {decoded_output}")
Tags: python, pytorch, huggingface-transformers, onnx, onnxruntime
1 Answer

Use https://huggingface.co/datasets/bakks/flan-t5-onnx instead.
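
For context on the original error: the ONNX Min operator only accepts integer tensors from opset 12 onwards, and T5's relative position bias computes torch.min over int64 tensors, so exporting with opset_version=11 produces a graph that ONNX Runtime rejects at load time. A minimal sketch of that narrower fix, keeping the question's export call and only raising the opset (note that a raw torch.onnx.export still gives you a single forward pass, with no generation loop or past-key-value caching):

with torch.no_grad():
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask, dummy_decoder_input_ids),
        onnx_path,
        opset_version=13,  # opset 11 lacks int64 support for Min
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        output_names=["output"],
        dynamic_axes={
            "input_ids": {0: "batch_size"},
            "attention_mask": {0: "batch_size"},
            "decoder_input_ids": {0: "batch_size"},
            "output": {0: "batch_size", 1: "sequence_length"},
        },
    )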

To convert google/flan-t5 yourself, see https://huggingface.co/datasets/bakks/flan-t5-onnx/blob/main/exportt5.py:

from pathlib import Path
import transformers as t
from transformers import AutoTokenizer, pipeline
from optimum.onnxruntime import ORTModelForSeq2SeqLM

# print out the version of the transformers library
print("transformers version:", t.__version__)



models = [
    #"google/flan-t5-small",
    #"google/flan-t5-base",
    #"google/flan-t5-large",
    "google/flan-t5-xl",
    "google/flan-t5-xxl",
]

for model_id in models:
    model_name = model_id.split("/")[1]
    onnx_path = Path("onnx/" + model_name)

    # load vanilla transformers and convert to onnx
    # (newer optimum releases use export=True in place of from_transformers=True)
    model = ORTModelForSeq2SeqLM.from_pretrained(model_id, from_transformers=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # save onnx checkpoint and tokenizer
    model.save_pretrained(onnx_path)
    tokenizer.save_pretrained(onnx_path)

Then try again. Note that the optimum export writes several files per model (encoder_model.onnx, decoder_model.onnx, and so on), so the session has to point at one concrete file rather than the directory:

import onnxruntime

onnx_model = onnxruntime.InferenceSession(
    str(onnx_path / "encoder_model.onnx"), providers=["CUDAExecutionProvider"]
)
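
Since the export already goes through optimum, a simpler route for end-to-end inference is to load the saved directory back with ORTModelForSeq2SeqLM, which wires the encoder and decoder sessions together and supports generate(). A minimal sketch, assuming the onnx/flan-t5-xl directory written by the loop above:

from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSeq2SeqLM

onnx_dir = "onnx/flan-t5-xl"  # directory produced by the export loop
tokenizer = AutoTokenizer.from_pretrained(onnx_dir)
model = ORTModelForSeq2SeqLM.from_pretrained(onnx_dir, provider="CUDAExecutionProvider")

inputs = tokenizer("What's the disease name in this text: Example text", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))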