When I run my code with python -m torch.distributed.launch --nproc_per_node=8 main3.py, trying to use 8 GPUs, does this create 8 separate runs of my script, so the training progress from the other 7 GPUs is lost? If so, why do I get only a single progress bar, with a reduced training time, while the printed "file created" message appears 8 times?

My code:
import os
import math
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import random
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 32 # Per-device batch size for training.
epochs = 3 # Number of epochs to train on each chunk.
chunk_size = 250000 # Number of lines per training chunk.
data_path = "data.txt"
start_token = "<start>"
end_token = "<end>"
# Read the number of lines in the data file
with open(data_path, "r", encoding="utf-8") as f:
num_lines = sum(1 for line in f)
# Calculate the number of chunks needed
num_chunks = math.ceil(num_lines / chunk_size)
# Use GPT-2's tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_model")
# Add the start and end tokens to the tokenizer
tokenizer.add_tokens([start_token, end_token])
# Load GPT-2's pre-trained model
model = GPT2LMHeadModel.from_pretrained("./gpt2_model")
# Resize the model's token embeddings to include the new tokens
model.resize_token_embeddings(len(tokenizer))
for chunk in range(num_chunks):
    # Clear previous data
    input_texts = []
    filename = ""
    # ...
    with open(data_path, "r", encoding="utf-8") as f:
        # Skip lines that have already been read
        for _ in range(chunk * chunk_size):
            next(f)
        # Read the lines for this chunk
        for _ in range(chunk_size):
            line = f.readline().strip()
            if not line:
                break
            # maxsplit=1 keeps any extra ":" inside target_text from breaking the unpack
            input_text, target_text = line.split(":", 1)
            input_texts.append(start_token + input_text + ":" + target_text + end_token)
    filename = "processed_data.txt"
    # Save the processed data into a new file
    with open(str(chunk) + filename, "w", encoding="utf-8") as f:
        for text in input_texts:
            f.write(text + "\n")
    print("file created:" + str(chunk) + filename)
    # Create the dataset using the tokenizer
    dataset = TextDataset(tokenizer=tokenizer, file_path=str(chunk) + filename, block_size=128)
    # Create a data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./gpt2_model",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        save_steps=10_000,
        save_total_limit=2,
        # Add these arguments to enable multi-GPU training
    )
    # Create a Trainer instance with the model, dataset, and training arguments
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    # Train the model on this chunk
    trainer.train()
    model.save_pretrained("./gpt2_model")
    tokenizer.save_pretrained("./gpt2_model")
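To see what the launcher is actually doing, I put together a minimal diagnostic that could go at the top of main3.py. This is a sketch, not part of my original script: it assumes a PyTorch version where torch.distributed.launch (or torchrun) sets the LOCAL_RANK and WORLD_SIZE environment variables for each process it spawns; older versions pass a --local_rank command-line argument instead.

import os

# The launcher starts one Python process per GPU, so every top-level
# statement in the script (including the preprocessing and the print)
# runs once per process. These variables identify the current process;
# the defaults also let the snippet run in a plain single-GPU session.
local_rank = int(os.environ.get("LOCAL_RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
print(f"started: local_rank={local_rank}, world_size={world_size}, pid={os.getpid()}")

# Guarding side effects on the main process would keep "file created"
# from being printed once per GPU:
if local_rank == 0:
    print("only the main process reaches this line")

If the actual chunk-file writing were guarded like this, the other ranks would still need a synchronization point (for example torch.distributed.barrier(), once the process group is initialized) before opening the file, since in distributed mode each process builds its own TextDataset from it.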