Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 13891

Runtime error with PyTorch's DDP in the diffusion model training

$
0
0

Issue

I am using CoDi which is multimodal Latent diffusion model. I am trying to remove the modules on images and video from CoDi and fine tune it with text-music pair data.

The training script(train.py) for generating music from a text prompt is as follows:

import torchimport torchaudioimport yamlimport matplotlib.pyplot as pltimport numpy as npimport osimport pandas as pdimport randomfrom torch.utils.data import Dataset, DataLoaderfrom core.models import codifrom core.models.ema import LitEmafrom core.models.common.get_optimizer import get_optimizerfrom argparse import ArgumentParserimport torch.distributed as distfrom torch.nn.parallel import DistributedDataParallel as DDPfrom torch.utils.data.distributed import DistributedSamplerfrom core.models.common.get_model import get_modelimport warningswarnings.filterwarnings('ignore')import torch.multiprocessing as mpdef load_yaml_config(filepath):    with open(filepath, 'r') as file:        return yaml.safe_load(file)class ConfigObject(object):    def __init__(self, dictionary):        for key in dictionary:            setattr(self, key, dictionary[key])def collate_fn(batch):    texts, audios = zip(*batch)    # 最大のオーディオ長を見つける    max_length = max(audio.shape[1] for audio in audios)    # パディング    audios_padded = torch.stack([torch.nn.functional.pad(audio, (0, max_length - audio.shape[1])) for audio in audios])    texts = torch.stack(texts)    return texts, audios_paddedsample_rate = 48000### Model Define===============================================================def model_define(x, c):    if x == "audio" and c == "text":        # AudioLDM        audioldm_cfg = load_yaml_config('configs/model/audioldm.yaml')        audioldm = ConfigObject(audioldm_cfg["audioldm_autoencoder"])        # CLIP        clip_cfg = load_yaml_config('configs/model/clip.yaml')        clip = ConfigObject(clip_cfg["clip_frozen"])        # Unet        unet_cfg = load_yaml_config('configs/model/openai_unet.yaml')        unet_cfg["openai_unet_codi"]["args"]["unet_audio_cfg"] = ConfigObject(unet_cfg["openai_unet_2d_audio"])        unet = ConfigObject(unet_cfg["openai_unet_codi"])        # CoDi        codi_cfg = load_yaml_config('configs/model/codi.yaml')        codi_cfg["codi"]["args"]["audioldm_cfg"] = audioldm        codi_cfg["codi"]["args"]["clip_cfg"] = clip        codi_cfg["codi"]["args"]["unet_config"] = unet        codi = ConfigObject(codi_cfg["codi"])        model = get_model()(codi)        return model    elif x == "text" and c == "audio":        # Optimus        optimus_cfg = load_yaml_config('configs/model/optimus.yaml')        optimus_cfg['optimus_vae']['args']['encoder'] = ConfigObject(optimus_cfg['optimus_bert_encoder'])        optimus_cfg['optimus_vae']['args']['encoder'].args['config'] = ConfigObject(optimus_cfg['optimus_bert_encoder']['args']['config'])        optimus_cfg['optimus_vae']['args']['decoder'] = ConfigObject(optimus_cfg['optimus_gpt2_decoder'])        optimus_cfg['optimus_vae']['args']['decoder'].args['config'] = ConfigObject(optimus_cfg['optimus_gpt2_decoder']['args']['config'])        optimus_cfg['optimus_vae']['args']['tokenizer_encoder'] = ConfigObject(optimus_cfg['optimus_bert_tokenizer'])        optimus_cfg['optimus_vae']['args']['tokenizer_decoder'] = ConfigObject(optimus_cfg['optimus_gpt2_tokenizer'])        optimus_cfg['optimus_vae']['args']['args'] = ConfigObject(optimus_cfg['optimus_vae']['args']['args'])        optimus = ConfigObject(optimus_cfg["optimus_vae"])        # CLAP        clap_cfg = load_yaml_config('configs/model/clap.yaml')        clap = ConfigObject(clap_cfg["clap_audio"])        # Unet        unet_cfg = load_yaml_config('configs/model/openai_unet.yaml')        unet_cfg["openai_unet_codi"]["args"]["unet_text_cfg"] = ConfigObject(unet_cfg["openai_unet_0dmd"])        unet = ConfigObject(unet_cfg["openai_unet_codi"])        # CoDi        codi_cfg = load_yaml_config('configs/model/codi.yaml')        codi_cfg["codi"]["args"]["optimus_cfg"] = optimus        codi_cfg["codi"]["args"]["clap_cfg"] = clap        codi_cfg["codi"]["args"]["unet_config"] = unet        codi = ConfigObject(codi_cfg["codi"])        model = get_model()(codi)        return model    # AutoKL    #autokl_cfg = load_yaml_config('configs/model/sd.yaml')    #autokl = ConfigObject(autokl_cfg["sd_autoencoder"])# Dataset=============================================================class MusicCaps(Dataset):    def __init__(self, csv_file, audio_dir, model, x, c, transform=None):        self.audio_dir = audio_dir        self.transform = transform        self.data = []        self.model = model        self.x = x        self.c = c        all_data = pd.read_csv(csv_file)        # Checks for the existence of audio files and adds only those data that exist to the list        for idx, row in all_data.iterrows():            audio_path = os.path.join(self.audio_dir, f"{row['ytid']}.wav")            if os.path.exists(audio_path):                self.data.append(row)    def __len__(self):        return len(self.data)    def __getitem__(self, idx):        row = self.data[idx]        caption = row['caption'] # raw text        audio_path = os.path.join(self.audio_dir, f"{row['ytid']}.wav")            waveform = torchaudio.load(audio_path) # raw audio(Tensor)        if self.x == "audio" and self.c == "text":            mel_latent = self.model.module.audioldm_encode(waveform[0]).detach() # transform mel-spectrogram(Tensor) into latent space            text_emb = self.model.module.clip_encode_text([caption]).detach()            return mel_latent, text_emb # data, condition        elif self.x == "text" and self.c == "audio":            text_latent = self.model.module.optimus_encode([caption]).detach()            audio_emb = self.model.module.clap_encode_audio(waveform[0]).detach()            return text_latent, audio_emb # data, condition### Training=============================================def train():    parser = ArgumentParser('DDP usage example')    parser.add_argument('--local_rank', type=int, default=-1, metavar='N', help='Local process rank.')  # you need this argument in your scripts for DDP to work    args = parser.parse_args()    args.is_master = args.local_rank == 0    x = os.environ['XTYPE']    c = os.environ['CTYPE']    # init    torch.cuda.set_device(args.local_rank)    dist.init_process_group(backend='nccl', init_method='env://')    print(args.local_rank)    torch.cuda.manual_seed_all(42)    torch.manual_seed(42)    np.random.seed(42)    random.seed(42)    print("model difine")    model = model_define(x, c)    model = model.to(args.local_rank)    model = DDP(model, device_ids=[args.local_rank])    # Optimizer    ema = LitEma(model)    optimizer_config = {'type': 'adam','args': {'weight_decay': 1e-4  # Weight decay                }            }    optimizer_config = ConfigObject(optimizer_config)    optimizer = get_optimizer()(model, optimizer_config)    print("data load")    dataset = MusicCaps(csv_file='/raid/m236866/md-mt/datasets/musiccaps/musiccaps-public.csv',                        audio_dir='/raid/m236866/md-mt/datasets/musiccaps/musiccaps_30',                         model=model,                         x=x,                         c=c)    sampler = DistributedSampler(dataset, rank=args.local_rank)    dataloader = DataLoader(dataset,                             batch_size=6,                             sampler=sampler,                             collate_fn=collate_fn,                             pin_memory=True,                             num_workers=os.cpu_count(),                             multiprocessing_context='spawn')                    torch.backends.cudnn.benchmark = True    print("train start")    num_epochs=2    running_loss = 0    for epoch in range(num_epochs):        model.train()        dist.barrier()        for batch_idx, (data, condition) in enumerate(dataloader):            print("epoch", epoch, "batch", batch_idx)            optimizer.zero_grad()            data = data.to(args.local_rank)            condition = condition.to(args.local_rank)            loss = model.forward(x=data, c=condition)            loss.backward()            optimizer.step()            # EMA update            ema.update(model.parameters())            running_loss += loss*data.size(0)            print(batch_idx)        dist.all_reduce(running_loss, op=dist.ReduceOp.SUM)    print(running_loss)    dist.destroy_process_group()if __name__ == "__main__":    train()

The following command was executed:

source ./ddp_train.sh

ddp_train.sh is as follows:

#!/bin/bashexport CUDA_VISIBLE_DEVICES=3,4,5,6,7export MASTER_ADDR=localhostexport MASTER_PORT=12345   export NODE_RANK=0export NUM_NODES=1export NUM_GPUS_PER_NODE=5export WORLD_SIZE=$(($NUM_NODES * $NUM_GPUS_PER_NODE))export XTYPE='audio'export CTYPE='text'python -m torch.distributed.launch --nproc_per_node=$NUM_GPUS_PER_NODE --nnodes=$NUM_NODES --node_rank $NODE_RANK train.py

When I execute this, I get the following error(Repeats on the command line):

Traceback (most recent call last):  File "<string>", line 1, in <module>  File "/home/m236866/.conda/envs/codi/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main    exitcode = _main(fd, parent_sentinel)  File "/home/m236866/.conda/envs/codi/lib/python3.8/multiprocessing/spawn.py", line 126, in _main    self = reduction.pickle.load(from_parent)  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 780, in __setstate__    self.process_group = _get_default_group()  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 429, in _get_default_group    raise RuntimeError(RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

Environment

OS: Ubuntu 18.04.6Python: 3.8.18Torch Version : 1.12.1+cu116GPU Name: Tesla V100-SXM3-32GB

Installed with reference to CoDi repository

conda create -n CoDi python=3.8 # prepare an environmentpip install torch==1.12.1+cu116 torchaudio==0.12.1+cu116 torchvision==0.13.1+cu116 \-f https://download.pytorch.org/whl/torch_stable.html # change cuda version higher pytorch version is doablepip install -r requirement.txt

I would like to solve the above error so that distributed learning can be done using DDP.


Reference


Viewing all articles
Browse latest Browse all 13891

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>