I want to train the "flax-community/t5-large-wikisplit" model on the "dxiao/requirements-ner-id" dataset (just for some experiments).
I suspect my general approach is wrong, but I don't know how to get any further. My code:
Load the tokenizer and model:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "flax-community/t5-large-wikisplit"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).cuda()
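As a quick sanity check that the checkpoint loads and generates (the example sentence is made up):

# hypothetical smoke test: split one complex sentence with the pretrained model
sample = "The couch is red and it is very comfortable and it was cheap."
inputs = tokenizer(sample, return_tensors="pt").to(model.device)
generated = model.generate(**inputs, max_length=128)
print(tokenizer.decode(generated[0], skip_special_tokens=True))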
Load the dataset I want to train on:
from datasets import load_dataset
raw_dataset = load_dataset("dxiao/requirements-ner-id")
raw_dataset has the columns ['id', 'tokens', 'tags', 'ner_tags'].
I want the sentences as whole sentences rather than as token lists.
def tokenToString(tokenarray):
    # equivalent to " ".join(tokenarray)
    string = tokenarray[0]
    for x in tokenarray[1:]:
        string += " " + x
    return string

def sentence_function(example):
    return {"sentence": tokenToString(example["tokens"]),
            "simplefiedSentence": tokenToString(example["tokens"]).replace("The", "XXXXXXXXXXX")}
wikisplit_req_set = raw_dataset.map(sentence_function)
wikisplit_req_set
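To check that the map worked, I can look at one row (the column contents are what I expect, not verified):

# each example should now carry the original columns plus the two new string fields
print(wikisplit_req_set["train"][0]["sentence"])
print(wikisplit_req_set["train"][0]["simplefiedSentence"])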
I tried to restructure the dataset so it looks like the wikisplit dataset:
simple1dataset = wikisplit_req_set.remove_columns(['id', 'tags', 'ner_tags', 'tokens'])
complexdataset = wikisplit_req_set.remove_columns(['id', 'tags', 'ner_tags', 'tokens'])
for split in ["train", "test", "validation"]:
    complexdataset[split] = (complexdataset[split]
        .add_column("simple_sentence_1", simple1dataset[split]["sentence"])
        .add_column("simple_sentence_2", simple1dataset[split]["simplefiedSentence"]))
trainingDataSet = complexdataset.rename_column("sentence", "complex_sentence")
trainingDataSet
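To confirm the splits now expose wikisplit-style column names:

# expect something like ['simplefiedSentence', 'complex_sentence', 'simple_sentence_1', 'simple_sentence_2']
print(trainingDataSet["train"].column_names)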
Tokenize it:
def tokenize_function(example):
    model_inputs = tokenizer(example["complex_sentence"], truncation=True, padding=True)
    targetS1 = tokenizer(example["simple_sentence_1"], truncation=True, padding=True)
    targetS2 = tokenizer(example["simple_sentence_2"], truncation=True, padding=True)
    model_inputs['simple_sentence_1'] = targetS1['input_ids']
    model_inputs['simple_sentence_2'] = targetS2['input_ids']
    model_inputs['decoder_input_ids'] = targetS2['input_ids']
    return model_inputs
tokenized_datasets = trainingDataSet.map(tokenize_function, batched=True)
tokenized_datasets=tokenized_datasets.remove_columns("complex_sentence")
tokenized_datasets=tokenized_datasets.remove_columns("simple_sentence_1")
tokenized_datasets=tokenized_datasets.remove_columns("simple_sentence_2")
tokenized_datasets=tokenized_datasets.remove_columns("simplefiedSentence")
tokenized_datasets
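One thing I noticed while reading around: my map never creates a labels column, and as far as I understand the Trainer computes the loss from labels. A minimal sketch of what the usual seq2seq preprocessing seems to look like, assuming a transformers version that supports text_target (I pick simple_sentence_1 as the target arbitrarily):

def tokenize_function_v2(example):
    model_inputs = tokenizer(example["complex_sentence"], truncation=True)
    # text_target tokenizes the target side; its input_ids become the labels
    targets = tokenizer(text_target=example["simple_sentence_1"], truncation=True)
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs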
Data collator:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
data_collator
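As far as I can tell, DataCollatorForLanguageModeling with mlm=False builds labels for causal LM by copying the input_ids; the collator usually paired with Seq2SeqTrainer seems to be DataCollatorForSeq2Seq, which pads the labels and masks the padding with -100. A sketch:

from transformers import DataCollatorForSeq2Seq

# pads inputs and labels per batch; -100 label padding is ignored by the loss
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)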
Training:
import evaluate
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

bleu = evaluate.load("bleu")
training_args = Seq2SeqTrainingArguments(
output_dir = "/",
log_level = "error",
num_train_epochs = 0.25,
learning_rate = 5e-4,
lr_scheduler_type = "linear",
warmup_steps = 50,
optim = "adafactor",
weight_decay = 0.01,
per_device_train_batch_size = 1,
per_device_eval_batch_size = 1,
gradient_accumulation_steps = 16,
evaluation_strategy = "steps",
eval_steps = 50,
predict_with_generate=True,
generation_max_length = 128,
save_steps = 500,
logging_steps = 10,
push_to_hub = False,
auto_find_batch_size=True
)
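From the docs, compute_metrics expects a callable taking an EvalPrediction, not the metric object itself, so I wrap the bleu metric like this (the -100 replacement assumes the collator masks label padding):

import numpy as np

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    # labels may contain -100 (ignored positions); restore pad ids before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return bleu.compute(predictions=decoded_preds,
                        references=[[label] for label in decoded_labels])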
trainer = Seq2SeqTrainer(
model,
training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["validation"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,  # the wrapper above, not the raw metric object
)
trainer.train()
The problem is that I don't understand how the model knows the expected values and how the loss is computed. Can someone give me some pointers?
I hope someone can help me understand my own code, because the Hugging Face documentation hasn't helped me enough; maybe someone has some code samples. I don't fully understand how to fine-tune the model, which inputs it expects for training, how the training actually works, or what the parameters do.
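For reference, my current understanding of the loss, as a minimal illustration with made-up sentences: when labels is passed, the model builds decoder_input_ids by shifting the labels right internally and returns the token-level cross-entropy.

import torch

src = tokenizer("The house is big and the house is old.", return_tensors="pt").to(model.device)
tgt = tokenizer("The house is big. The house is old.", return_tensors="pt").to(model.device)

with torch.no_grad():
    out = model(input_ids=src.input_ids,
                attention_mask=src.attention_mask,
                labels=tgt.input_ids)
print(out.loss)  # the scalar the Trainer would backpropagate

Is that the right mental model?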