我正在使用 IMDb 数据集和基于 GPT-2 的模型进行情感分析。这是一个了解 PEFT 和 LORA 并获得一些 Huggingface 库经验的玩具项目。
这是我尝试过的:
from datasets import load_dataset
splits = ["train", "test"]
ds = {split: ds for split, ds in zip(splits, load_dataset("imdb", split=splits))}
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 Tokenizer doesn't have a padding token.
tokenizer.pad_token = tokenizer.eos_token
def preprocess_function(examples):
"""Preprocess the imdb dataset by returning tokenized examples."""
tokens = tokenizer(examples['text'],padding='max_length',truncation=True)
return tokens
tokenized_ds = {}
for split in splits:
tokenized_ds[split] = ds[split].map(preprocess_function, batched=True)
model2 = AutoModelForSequenceClassification.from_pretrained(
"gpt2",
num_labels=2,
id2label={0: "NEGATIVE", 1: "POSITIVE"}, # For converting predictions to strings
label2id={"NEGATIVE": 0, "POSITIVE":1},
)
model2.config.pad_token_id = model.config.eos_token_id
from peft import LoraConfig
from peft import get_peft_model
lora_config = LoraConfig("lora_gpt2", fan_in_fan_out=True,)
lora_model = get_peft_model(model2, lora_config)
trainer_lora = Trainer(
model=lora_model,
args=TrainingArguments(
output_dir="./data/sentiment_analysis2",
learning_rate=2e-3,
# Reduce the batch size if you don't have enough memory
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
num_train_epochs=5,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
),
train_dataset=tokenized_ds["train"],
eval_dataset=tokenized_ds["test"],
tokenizer=tokenizer,
data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
compute_metrics=compute_metrics,
)
trainer_lora.train()
当我运行此代码时,我收到以下错误并且在调试问题时遇到一些困难:
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3018, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3016 # The model's main input name, usually `input_ids`, has be passed for padding
3017 if self.model_input_names[0] not in encoded_inputs:
-> 3018 raise ValueError(
3019 "You should supply an encoding or a list of encodings to this method "
3020 f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
3021 )
3023 required_input = encoded_inputs[self.model_input_names[0]]
3025 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['label']
我不确定如何解决这个问题,并且在网上找不到很多相关示例,希望 SO 社区能够提供帮助。
LoRA 模型将预期的列名称从标签更改为标签