Skip to content

fix tp only bug#3908

Merged
SunMarc merged 1 commit into
huggingface:main from
sywangyi:tp_only
Jan 14, 2026
Merged

fix tp only bug#3908
SunMarc merged 1 commit into
huggingface:main from
sywangyi:tp_only

Conversation

@sywangyi

Copy link
Copy Markdown
Contributor

Signed-off-by: Wang, Yi <yi.a.wang@intel.com>
@sywangyi

Copy link
Copy Markdown
Contributor Author
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

from utils import get_dataset


# Default model identifier: used as the default for --model-name and, via its
# last path component, to build the default --save-dir.
MODEL_ID = "NousResearch/Hermes-3-Llama-3.1-8B"


def parse_args():
    """Parse the command-line options of this training script.

    Recognized options (all optional):
        --sequence-length (int, default 1024)
        --checkpoint-frequency (int, default 100)
        --model-name (str, default ``MODEL_ID``)
        --save-dir (str, default derived from the last component of ``MODEL_ID``)
        --device-type (str, default "auto")

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    default_save_dir = f"./accelerate-nd-parallel-{MODEL_ID.split('/')[-1]}"

    # (flag, value type, default) — registered in this order.
    option_specs = (
        ("--sequence-length", int, 1024),
        ("--checkpoint-frequency", int, 100),
        ("--model-name", str, MODEL_ID),
        ("--save-dir", str, default_save_dir),
        ("--device-type", str, "auto"),
    )

    cli = argparse.ArgumentParser()
    for flag, value_type, fallback in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback)
    return cli.parse_args()


def main():
    """Train a causal language model through ``Trainer`` with ``tp_plan="auto"``.

    Steps, in order:
      1. Parse CLI options with ``parse_args``.
      2. Resolve ``--device-type auto`` to the current accelerator type
         (falling back to "cpu" when no accelerator is reported).
      3. Load the tokenizer and the model (``use_cache=False``, ``tp_plan="auto"``).
      4. Use the EOS token as the padding token when the tokenizer has none.
      5. Build the dataset with ``get_dataset`` and run ``Trainer.train()``,
         then ``Trainer.save_model()``.

    No ``ParallelismConfig`` is passed to ``TrainingArguments``; the related
    lines are kept commented out below.
    """
    # If ParallelismConfig is not initialized with __init__, it reads from env vars
    # which were set by using config
    args = parse_args()
    #    pc = ParallelismConfig()
    if args.device_type == "auto":
        # current_accelerator() may return None when no accelerator is available;
        # guard so the script does not fail with AttributeError on `.type`.
        current = torch.accelerator.current_accelerator()
        args.device_type = current.type if current is not None else "cpu"

    model_kwargs = {"tp_plan": "auto"}

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    model = AutoModelForCausalLM.from_pretrained(args.model_name, use_cache=False, **model_kwargs)

    # Fall back to the EOS token when the tokenizer defines no padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    packed_dataset = get_dataset(tokenizer, args.sequence_length)

    training_args = TrainingArguments(
        output_dir=args.save_dir,
        num_train_epochs=1,
        #        parallelism_config=pc,
        per_device_train_batch_size=1,
        logging_steps=5,
        save_steps=args.checkpoint_frequency,
        learning_rate=5e-5,
        remove_unused_columns=False,
        max_steps=200,
        bf16=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        processing_class=tokenizer,
        train_dataset=packed_dataset,
    )

    trainer.train()
    trainer.save_model()


# Run the training entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()

torchrun --nproc_per_node=4 trainer_tp.py --sequence-length 1024

@SunMarc SunMarc left a comment

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks a lot! Just a nit.

Comment thread src/accelerate/accelerator.py
@HuggingFaceDocBuilderDev

Copy link
Copy Markdown

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.

@SunMarc SunMarc merged commit 38dadd9 into huggingface:main Jan 14, 2026
23 of 25 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants