I want to use the paraphrase-multilingual-mpnet-base-v2 model to build embeddings, but I get this error:
RuntimeError: CUDA error: device-side assert triggered
The error occurs when executing `string = {k: v.to(device=device) for k, v in string.items()}`.
Why do I get this error?
I am working in Google Colab with 12.7 GB RAM and 16 GB GPU RAM.
The goal of the code is to generate sentence embeddings; with some customization, chunk-wise execution is also possible.
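For context, the plain single-text usage from the model card looks like this (a minimal sketch with a placeholder sentence; mean pooling as in my full code below):

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Tokenize one short sentence; truncation keeps it within the model limit
encoded = tokenizer("Das ist ein Beispielsatz.", return_tensors='pt', truncation=True)
with torch.no_grad():
    output = model(**encoded)

# Mean pooling over the token embeddings
mask = encoded['attention_mask'].unsqueeze(-1).float()
embedding = (output[0] * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
```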
The complete error message:
```
RuntimeError                              Traceback (most recent call last)
<ipython-input-17-8e6bf00d9e24> in <cell line: 104>()
    102         return np.nan
    103
--> 104 processed_data = processDataRAG(df[5000:], tokenizer, model)

4 frames

<ipython-input-17-8e6bf00d9e24> in processDataRAG(data, tokenizer, model)
     10     sents = [str(sentences[0]) for sentences in article_sentences]
     11     number_of_article =[sentences[1] for sentences in article_sentences]
---> 12     embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc = "Create chunk-wise embeddings")]
     13     return pd.DataFrame({
     14         "sentences": sents,

<ipython-input-17-8e6bf00d9e24> in <listcomp>(.0)
     10     sents = [str(sentences[0]) for sentences in article_sentences]
     11     number_of_article =[sentences[1] for sentences in article_sentences]
---> 12     embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc = "Create chunk-wise embeddings")]
     13     return pd.DataFrame({
     14         "sentences": sents,

<ipython-input-17-8e6bf00d9e24> in embeddChunkwise(string, tokenizer, model, chunk_size)
     55         #encoded_input = tokenizer(tokenizer.detokenize(tokenized_chunk))
     56         if len(encoded_chunk) > 0:
---> 57             embedded_chunk = createEmbeddings(
     58                 tokenizer(tokenizer.decode(encoded_chunk, skip_special_tokens = True), return_tensors='pt', add_special_tokens=False),
     59                 model

<ipython-input-17-8e6bf00d9e24> in createEmbeddings(string, model)
     77     #print("Length of input_ids: ", len(string["input_ids"][0]))
     78     if "input_ids" in string.keys():
---> 79         string = {k: v.to(device=device) for k, v in string.items()}
     80         with torch.no_grad():
     81

<ipython-input-17-8e6bf00d9e24> in <dictcomp>(.0)
     77     #print("Length of input_ids: ", len(string["input_ids"][0]))
     78     if "input_ids" in string.keys():
---> 79         string = {k: v.to(device=device) for k, v in string.items()}
     80         with torch.no_grad():
     81

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
```
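As the message itself notes, CUDA errors can be reported asynchronously, so the line in the trace is not necessarily where the failure happens. To get a synchronous trace, CUDA_LAUNCH_BLOCKING=1 has to be set before the first CUDA call, e.g. at the top of the notebook (a minimal sketch):

```python
import os

# Must run before torch initializes CUDA for the first time,
# otherwise it has no effect in the running session
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
```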
I run this code:
```python
import numpy as np
import pandas as pd
import torch
from torch import cuda
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel

tqdm.pandas()  # enables .progress_apply on DataFrames

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Select device globally
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2', device_map=device)

# nlp_de (a spaCy pipeline for sentence splitting) and file_path are defined elsewhere in the notebook
df = pd.read_json(file_path)

def processDataRAG(data, tokenizer, model):
    # Split each article into sentences and keep the article index alongside each sentence
    article_sentences = data.content.progress_apply(lambda x: list(nlp_de(x).sents))
    #tokenized_articles = data.content.progress_apply(lambda article: tokenizeChunkwise(article, tokenizer, 512))
    article_sentences = [
        (sentences, idx)
        for idx, article in tqdm(enumerate(list(article_sentences)), desc="Loop over articles with index")
        for sentences in article
    ]
    sents = [str(sentences[0]) for sentences in article_sentences]
    number_of_article = [sentences[1] for sentences in article_sentences]
    embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512)
                         for sentence in tqdm(sents, desc="Create chunk-wise embeddings")]
    return pd.DataFrame({
        "sentences": sents,
        "embeddings": embedded_sentencs,
        "article": number_of_article
    })

def embeddChunkwise(string, tokenizer, model, chunk_size):
    decreasing_by_special_tokens = 0  # Because of special tokens at the beginning and end
    encoded_string = tokenizer(string, add_special_tokens=False)
    if len(encoded_string["input_ids"]) / chunk_size > 1:
        print("Tokenized_string:", encoded_string)
        print("Total tokens: ", str(len(encoded_string["input_ids"])))
        print("Tokenized string in chunks: ", str(len(encoded_string["input_ids"]) / chunk_size),
              " --- ", str(len(encoded_string["input_ids"]) // chunk_size + 1))
    embedded_chunks = []
    for idx in range(len(encoded_string["input_ids"]) // chunk_size + 1):
        encoded_chunk = None
        if (chunk_size - decreasing_by_special_tokens) * (idx + 1) < len(encoded_string["input_ids"]):
            # Sentences with e.g. 1000 words as instances: take a full chunk
            start_idx, end_idx = (chunk_size * idx - decreasing_by_special_tokens * idx,
                                  chunk_size * (idx + 1) - decreasing_by_special_tokens * (idx + 1))
            encoded_chunk = encoded_string["input_ids"][start_idx:end_idx]
        else:
            # Sentences with e.g. 20 words as instance: take the remainder
            if chunk_size - decreasing_by_special_tokens > len(encoded_string["input_ids"]):
                encoded_chunk = encoded_string["input_ids"][chunk_size * idx - decreasing_by_special_tokens * idx:]
            else:
                encoded_chunk = encoded_string["input_ids"][-(chunk_size * idx - decreasing_by_special_tokens * idx):]
        if len(encoded_chunk) > 0:
            # Decode the chunk back to text and re-tokenize it for the model
            embedded_chunk = createEmbeddings(
                tokenizer(tokenizer.decode(encoded_chunk, skip_special_tokens=True),
                          return_tensors='pt', add_special_tokens=False),
                model
            )
            if isinstance(embedded_chunk, list):
                embedded_chunks.append(embedded_chunk[0])
    if len(embedded_chunks) > 1:
        return embedded_chunks
    elif len(embedded_chunks) == 0:
        return np.nan
    else:
        return embedded_chunks[0]

def createEmbeddings(string, model):
    if "input_ids" in string.keys():
        string = {k: v.to(device=device) for k, v in string.items()}
        with torch.no_grad():
            try:
                model_output = model(**string)
            except Exception as ex:
                print("--- Error by creating Embeddings ---")
                print("Error: ", str(ex))
                return np.nan
        # Perform pooling. In this case, average pooling
        try:
            sentence_embeddings = mean_pooling(model_output, string['attention_mask'])
        except Exception as ex:
            print("--- Error by pooling embeddings ---")
            print("Model output: ", str(model_output))
            print("Attention_mask: ", str(string['attention_mask']))
            print("Error: ", str(ex))
            return np.nan
        sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
        return sentence_embeddings
    else:
        return np.nan
```
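Since the assert only surfaces on the `.to(device)` line, I assume the real failure happened in an earlier CUDA call, most likely a previous model forward pass. If it helps, a sanity check of the inputs right before the model call could look like this (a sketch; `check_inputs` is a hypothetical helper, and `encoded` stands for the dict that is passed to `createEmbeddings`):

```python
def check_inputs(encoded, tokenizer, model):
    """Sanity-check the two usual suspects for a device-side assert
    in an embedding lookup: out-of-range token ids and over-long sequences."""
    ids = encoded["input_ids"][0]
    assert int(ids.max()) < model.config.vocab_size, "token id out of vocabulary range"
    assert ids.shape[0] <= tokenizer.model_max_length, (
        f"{ids.shape[0]} tokens exceed the model limit of {tokenizer.model_max_length}"
    )
```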