Commit 516667e

Merge branch 'main' of github.com:jondurbin/llama-recipes
jondurbin committed Sep 1, 2023
2 parents b7a36d6 + 1437b91 commit 516667e
Showing 7 changed files with 17 additions and 17 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -135,7 +135,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo

```bash

-torchrun --nnodes 1 --nproc_per_node 8 llama_finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --micro_batch_size 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 llama_finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned

```
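The `--micro_batch_size` flag disappears from this command; gradient accumulation is now set explicitly through the config. A rough sketch of the resulting effective batch size (plain arithmetic, not code from the repo; the values mirror the command above and the new config default):

```python
# Effective global batch size per optimizer step under FSDP data parallelism (a sketch).
batch_size_training = 1          # --batch_size_training 1 (per-GPU batch)
gradient_accumulation_steps = 1  # new train_config field, default 1
world_size = 8                   # --nnodes 1 * --nproc_per_node 8

effective_batch = batch_size_training * gradient_accumulation_steps * world_size
print(effective_batch)  # 8 sequences per optimizer step
```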

2 changes: 1 addition & 1 deletion configs/training.py
@@ -11,6 +11,7 @@ class train_config:
low_cpu_fsdp: bool=False
run_validation: bool=True
batch_size_training: int=4
+gradient_accumulation_steps: int=1
num_epochs: int=3
num_workers_dataloader: int=1
lr: float=1e-4
@@ -21,7 +22,6 @@ class train_config:
mixed_precision: bool=True
val_batch_size: int=1
dataset = "samsum_dataset"
-micro_batch_size: int=4
peft_method: str = "lora" # None , llama_adapter, prefix
use_peft: bool=False
output_dir: str = "PATH/to/save/PEFT/model"
4 changes: 2 additions & 2 deletions docs/multi_gpu.md
@@ -68,7 +68,7 @@ If you are interested in running full parameter fine-tuning on the 70B model, yo

```bash

-torchrun --nnodes 1 --nproc_per_node 8 llama_finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --micro_batch_size 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned
+torchrun --nnodes 1 --nproc_per_node 8 llama_finetuning.py --enable_fsdp --low_cpu_fsdp --pure_bf16 --model_name /patht_of_model_folder/70B --batch_size_training 1 --dist_checkpoint_root_folder model_checkpoints --dist_checkpoint_folder fine-tuned

```

@@ -126,6 +126,7 @@ model_name: str="PATH/to/LLAMA 2/7B"
enable_fsdp: bool= False
run_validation: bool=True
batch_size_training: int=4
+gradient_accumulation_steps: int=1
num_epochs: int=3
num_workers_dataloader: int=2
lr: float=2e-4
@@ -135,7 +136,6 @@ use_fp16: bool=False
mixed_precision: bool=True
val_batch_size: int=4
dataset = "samsum_dataset" # alpaca_dataset, grammar_dataset
-micro_batch_size: int=1
peft_method: str = "lora" # None , llama_adapter, prefix
use_peft: bool=False
output_dir: str = "./ft-output"
6 changes: 3 additions & 3 deletions docs/single_gpu.md
@@ -4,7 +4,7 @@ To run fine-tuning on a single GPU, we will make use of two packages

1- [PEFT](https://huggingface.co/blog/peft) methods and in specific using HuggingFace [PEFT](https://github.com/huggingface/peft)library.

-2- [BitandBytes](https://github.com/TimDettmers/bitsandbytes) int8 quantization.
+2- [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) int8 quantization.

Given combination of PEFT and Int8 quantization, we would be able to fine_tune a Llama 2 7B model on one consumer grade GPU such as A10.

@@ -21,7 +21,7 @@ pip install -r requirements.txt

## How to run it?

-Get access to a machine with one GPU or if using a multi-GPU macine please make sure to only make one of them visible using `export CUDA_VISIBLE_DEVICES=GPU:id` and run the following. It runs by default with `samsum_dataset` for summarization application.
+Get access to a machine with one GPU or if using a multi-GPU machine please make sure to only make one of them visible using `export CUDA_VISIBLE_DEVICES=GPU:id` and run the following. It runs by default with `samsum_dataset` for summarization application.


```bash
@@ -82,6 +82,7 @@ model_name: str="PATH/to/LLAMA 2/7B"
enable_fsdp: bool= False
run_validation: bool=True
batch_size_training: int=4
+gradient_accumulation_steps: int=1
num_epochs: int=3
num_workers_dataloader: int=2
lr: float=2e-4
@@ -91,7 +92,6 @@ use_fp16: bool=False
mixed_precision: bool=True
val_batch_size: int=4
dataset = "samsum_dataset" # alpaca_dataset,grammar_dataset
-micro_batch_size: int=1
peft_method: str = "lora" # None , llama_adapter, prefix
use_peft: bool=False
output_dir: str = "./ft-output"
5 changes: 1 addition & 4 deletions llama_finetuning.py
@@ -65,9 +65,6 @@ def main(**kwargs):
clear_gpu_cache(local_rank)
setup_environ_flags(rank)

-# Calculate gradient accumulation steps
-gradient_accumulation_steps = train_config.batch_size_training // train_config.micro_batch_size
-
# Load the pre-trained model and setup its configuration
if train_config.enable_fsdp and train_config.low_cpu_fsdp:
"""
@@ -240,7 +237,7 @@ def main(**kwargs):
tokenizer,
optimizer,
scheduler,
-gradient_accumulation_steps,
+train_config.gradient_accumulation_steps,
train_config,
fsdp_config if train_config.enable_fsdp else None,
local_rank if train_config.enable_fsdp else None,
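In short, the accumulation factor is no longer derived from two batch-size fields but read straight from the config; a tiny sketch of the before/after behavior using the old default values:

```python
# Before: derived from two config fields, so micro_batch_size had to divide
# batch_size_training evenly.
batch_size_training = 4
micro_batch_size = 4  # field removed by this commit
old_gradient_accumulation_steps = batch_size_training // micro_batch_size  # -> 1

# After: read directly from train_config.gradient_accumulation_steps (default 1),
# so the two defaults are behaviorally equivalent.
new_gradient_accumulation_steps = 1
assert old_gradient_accumulation_steps == new_gradient_accumulation_steps
```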
1 change: 1 addition & 0 deletions scripts/spellcheck_conf/wordlist.txt
@@ -1121,3 +1121,4 @@ summarization
xA
Sanitization
tokenization
+bitsandbytes
14 changes: 8 additions & 6 deletions utils/train_utils.py
@@ -83,7 +83,9 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
with MemoryTrace() as memtrace: # track the memory usage
model.train()
total_loss = 0.0
-for step, batch in enumerate(tqdm(train_dataloader,colour="blue", desc=f"Training Epoch{epoch}")):
+total_length = len(train_dataloader)//gradient_accumulation_steps
+pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch}", total=total_length)
+for step, batch in enumerate(train_dataloader):
for key in batch.keys():
if train_config.enable_fsdp:
batch[key] = batch[key].to(local_rank)
@@ -99,17 +101,17 @@ def train(model, train_dataloader,eval_dataloader, tokenizer, optimizer, lr_sche
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
+pbar.update(step//gradient_accumulation_steps)
else:
# regular backpropagation when fp16 is not used
loss.backward()
if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
optimizer.step()
optimizer.zero_grad()
-if train_config.enable_fsdp:
-if rank==0:
-print(f"\n step {step} is completed and loss is {loss.detach().float()}")
-else:
-print(f"\n step {step} is completed and loss is {loss.detach().float()}")
+pbar.update(step//gradient_accumulation_steps)

+pbar.set_description(f"Training Epoch: {epoch}/{train_config.num_epochs}, step {step}/{len(train_dataloader)} completed (loss: {loss.detach().float()})")

epoch_end_time = time.perf_counter()-epoch_start_time
epoch_times.append(epoch_end_time)
# Reducing total_loss across all devices if there's more than one CUDA device
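To make the new loop structure easier to follow in isolation, here is a self-contained toy sketch of the gradient-accumulation-plus-progress-bar pattern (toy model and data, no FSDP or fp16 branches; for simplicity the bar is advanced once per optimizer step, whereas the code above calls `pbar.update(step//gradient_accumulation_steps)`):

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

model = nn.Linear(10, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
dataset = TensorDataset(torch.randn(64, 10), torch.randn(64, 1))
train_dataloader = DataLoader(dataset, batch_size=4)
gradient_accumulation_steps = 4

total_length = len(train_dataloader) // gradient_accumulation_steps
pbar = tqdm(colour="blue", desc="Training Epoch: 1", total=total_length)
for step, (x, y) in enumerate(train_dataloader):
    loss = nn.functional.mse_loss(model(x), y)
    # Scale the loss so the accumulated gradient averages over the micro-steps.
    (loss / gradient_accumulation_steps).backward()
    if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
        optimizer.step()
        optimizer.zero_grad()
        pbar.update(1)  # one tick per optimizer step
        pbar.set_description(f"step {step}/{len(train_dataloader)} (loss: {loss.item():.4f})")
pbar.close()
```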
