[ {
"topic": "Мы были достаточно цивилизованны, чтобы построить машину, но слишком примитивны, чтобы ею пользоваться». (Карл Краус)",
"text": "Высказывание Карла Крауса, австрийского писателя, о том, что «мы были достаточно цивилизованны, чтобы построить машину... }]
有代码:
import torch
# AutoModelWithLMHead is deprecated in recent transformers releases; for a
# DialoGPT-style decoder-only model the correct class is AutoModelForCausalLM.
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

# Russian-language DialoGPT checkpoint used as the fine-tuning base.
model_name = "tinkoff-ai/ruDialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
import json
def prepare_data(filepath):
    """Load training examples from a JSON file and build prompt strings.

    The file must contain a JSON list of objects with "topic" and "text"
    keys. Each example becomes a single string: topic immediately followed
    by text (no separator — preserved from the original behavior; consider
    adding an explicit separator/EOS token for cleaner LM conditioning).

    Parameters
    ----------
    filepath : str
        Path to the JSON file.

    Returns
    -------
    list[str]
        One concatenated prompt+text string per example, in file order.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Comprehension replaces the original index-based loop + append.
    return [example["topic"] + example["text"] for example in data]
# NOTE(review): these prepared prompt+text strings are computed but never
# used — the TextDataset objects created below re-read the raw JSON files
# as plain text instead. Either tokenize these lists into a Dataset, or
# write them out to .txt files and point the datasets at those.
train_inputs = prepare_data("train.json")
test_inputs = prepare_data("test.json")
from transformers import TextDataset, DataCollatorForLanguageModeling

# NOTE(review): TextDataset reads the file as raw text and chunks it into
# 128-token blocks — it does NOT parse JSON, so the model is being trained
# on JSON syntax (braces, quotes, keys) rather than the cleaned prompts
# built above. TextDataset is also deprecated in current transformers;
# the `datasets` library (`load_dataset` + tokenizer map) is the
# recommended replacement — verify against the transformers version in use.
train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.json", block_size=128)
test_dataset = TextDataset(tokenizer=tokenizer, file_path="test.json", block_size=128)
# mlm=False => plain causal-LM objective (labels are the shifted inputs),
# which is what a DialoGPT-style decoder model needs.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)
from transformers import Trainer, TrainingArguments  # re-import; already in scope above

# Hyperparameters for fine-tuning. The key addition is `save_total_limit`:
# by default the Trainer writes a full checkpoint (model weights PLUS
# optimizer/scheduler state — several GB each for a ~350M-param model)
# every 500 steps and never deletes old ones. That is what fills tens of
# GB of disk during a run; capping the retained checkpoints bounds it.
training_args = TrainingArguments(
    output_dir="./models",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=False,
    # Keep only the 2 most recent checkpoints; older ones are deleted
    # automatically. Fixes the ">19 GB output" / "2-3 GB every 10 min"
    # disk-growth problem without changing the training result.
    save_total_limit=2,
)
# Wire model, hyperparameters, datasets, and collator into the HF Trainer
# and launch fine-tuning. Checkpoints and final weights land in
# training_args.output_dir ("./models").
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)
trainer.train()
但是它需要很多内存,而且我无法在 Kaggle 中对其进行微调。输出 >19gb。
这是正常现象还是我该如何解决?
[2501/7802 59:11 < 26:52, 2.06 it/s, Epoch 0.86/3] — the output directory grows by an additional 2–3 GB roughly every 10 minutes; is that normal?