I am fine-tuning a model with DPO on top of a saved SFT LoRA checkpoint. Here is my training script:

```python
import torch
from datasets import load_dataset, Dataset
from transformers import (
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)
from peft import AutoPeftModelForCausalLM, LoraConfig
from trl import DPOTrainer

hf_auth = ""
peft_model_path = "test/"

dataset = load_dataset("test_classification")
print("Dataset loaded:", dataset)


def format_instruction(vignette: str):
    return f"""<s>[INST]{vignette.strip()} Generate given Vignette class and explain the reason for class.[/INST] """.strip()


def generate_instruction_dataset(data_point):
    # DPOTrainer expects "prompt", "chosen", and "rejected" columns.
    return {
        "chosen": data_point["chosen"],
        "rejected": data_point["rejected"],
        "prompt": format_instruction(data_point["prompt"]),
    }


def process_dataset(data: Dataset):
    return data.shuffle(seed=42).map(generate_instruction_dataset)


dataset = process_dataset(dataset)
print("Dataset processed:", dataset)

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

print("Loading base model:")
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,  # location of the saved SFT model
    device_map="auto",
    quantization_config=bnb_config,
)

print("Loading reference model:")
model_ref = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,  # same checkpoint as the policy model
    device_map="auto",
    quantization_config=bnb_config,
)

print("Loading tokenizer:")
tokenizer = AutoTokenizer.from_pretrained(
    peft_model_path,
    use_auth_token=hf_auth,
    trust_remote_code=True,
)

output_dir = "dpo/output/"
training_args = TrainingArguments(
    output_dir=output_dir,
    remove_unused_columns=True,
    per_device_train_batch_size=4,
)

print("LoRA config added")
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

print("DPO trainer initialized:")
dpo_trainer = DPOTrainer(
    model,
    model_ref,
    args=training_args,
    beta=0.1,
    train_dataset=dataset["train"],
    # eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    peft_config=peft_config,
    max_length=1024,
    max_prompt_length=512,
)

torch.set_grad_enabled(True)
print("DPO trainer started:")
dpo_trainer.train()
print("Training done")
```
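For context, the processed records look roughly like this; the vignette text and class labels below are made-up placeholders, not real data from my set:

```python
# Toy illustration of the record format fed to DPOTrainer; the vignette
# and completions here are invented purely for this example.
def format_instruction(vignette: str):
    return f"""<s>[INST]{vignette.strip()} Generate given Vignette class and explain the reason for class.[/INST] """.strip()

example = {
    "prompt": "A 45-year-old patient presents with ...",  # hypothetical vignette
    "chosen": "Class A, because ...",                     # preferred completion
    "rejected": "Class B, because ...",                   # dispreferred completion
}
print(format_instruction(example["prompt"]))
```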
I am using a g5.12xlarge instance for this training. It has the following GPUs:

GPU 0: NVIDIA A10G
GPU 1: NVIDIA A10G
GPU 2: NVIDIA A10G
GPU 3: NVIDIA A10G
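A quick sanity check confirms all four devices are visible to PyTorch (standard torch.cuda calls, nothing specific to my script):

```python
import torch

# On a g5.12xlarge this should report four NVIDIA A10G cards (cuda:0..cuda:3).
print("CUDA available:", torch.cuda.is_available())
for i in range(torch.cuda.device_count()):
    print(f"cuda:{i} -> {torch.cuda.get_device_name(i)}")
```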
But as soon as dpo_trainer.train() starts, the following error occurs:
```
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:3!
```
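My suspicion is that `device_map="auto"` shards each quantized model across all four GPUs, so the policy and reference models end up with layers on different devices (hence cuda:0 vs cuda:3). Below is a minimal sketch of how I am trying to narrow it down; `hf_device_map` is the placement map accelerate records when a device map is used (I access it via getattr in case the PEFT wrapper does not proxy it), and pinning everything to one GPU is only a workaround that assumes the 4-bit model fits on a single A10G:

```python
# Inspect where accelerate placed each submodule of the two models.
print("policy model map:", getattr(model, "hf_device_map", None))
print("reference model map:", getattr(model_ref, "hf_device_map", None))

# Workaround under test: load both models entirely on cuda:0 instead of
# letting device_map="auto" spread them over cuda:0..cuda:3.
model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    device_map={"": 0},  # place every module on cuda:0
    quantization_config=bnb_config,
)
model_ref = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_path,
    device_map={"": 0},
    quantization_config=bnb_config,
)
```

Is there a way to make DPOTrainer work with both models sharded across the four GPUs, or do I have to fall back to a single device like this?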