by Guest » 18 Feb 2025, 13:59
I am trying to fine-tune Llama 3.1 8B. I am using 4 A10G GPUs with 24 GB each.
Code: Select all
import torch
from accelerate import PartialState
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

# NOTE: base_model, new_model, dataset and tokenizer are defined earlier in the script (not shown here).
device_string = PartialState().process_index
torch_dtype = torch.float16
attn_implementation = "eager"
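# QLoRA-style 4-bit NF4 quantization (with double quantization) to reduce the base model's memory footprint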
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    # device_map={'':torch.cuda.current_device()},
    # device_map="auto",
    device_map={'':device_string},
    attn_implementation=attn_implementation
)
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)
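# Attach trainable LoRA adapters to the quantized base model; only the adapter weights are updated during training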
model = get_peft_model(model, peft_config)
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    # gradient_checkpointing_kwargs={'use_reentrant':False},
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="tensorboard"
)
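# Supervised fine-tuning via TRL's SFTTrainer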
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    # max_seq_length=512,
    # dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    # packing=False,
)

trainer.train()
I launch it with

Code: Select all
python -m torch.distributed.launch trainer.py
However, this results in a memory problem when loading the base model. Can multi-GPU be used for the finetuning? If so, what configuration change is required?
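For reference, here is a minimal launch sketch that starts one worker process per GPU (assuming trainer.py is the script above and all four A10Gs should be used; torchrun is the current replacement for the deprecated torch.distributed.launch, and both accept --nproc_per_node):

Code: Select all
# hypothetical single-node launch: one process per A10G
torchrun --nproc_per_node=4 trainer.py
# or, with the legacy launcher used above
python -m torch.distributed.launch --nproc_per_node=4 trainer.py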