I'm trying to build a chatbot that uses a stored Chroma DB containing product information to make recommendations. The chatbot works fine when making a recommendation based on user input, but when the same user asks a follow-up question such as "what is the price of the items you recommended?", it gives prices of other unrelated products, not taking the previous recommendation into account.
Here is a copy of my code:
import osimport torchfrom transformers import ( AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline)from datasets import load_datasetfrom peft import LoraConfig, PeftModelfrom langchain.text_splitter import CharacterTextSplitterfrom langchain.document_transformers import Html2TextTransformerfrom langchain.document_loaders import AsyncChromiumLoaderfrom langchain.embeddings.huggingface import HuggingFaceEmbeddingsfrom langchain.vectorstores import FAISSfrom langchain.prompts import PromptTemplatefrom langchain.schema.runnable import RunnablePassthroughfrom langchain.llms import HuggingFacePipelinefrom langchain.chains import LLMChainfrom langchain.vectorstores import Chromaimport chromadbimport gradio as grfrom langchain.schema import format_documentfrom langchain_core.messages import get_buffer_stringfrom langchain_core.runnables import RunnableLambda, RunnablePassthroughfrom langchain.memory import ConversationBufferMemoryfrom langchain.prompts.prompt import PromptTemplatefrom langchain_core.prompts.chat import ChatPromptTemplatefrom langchain.chains import ConversationalRetrievalChainfrom langchain.chains.conversation.memory import ConversationBufferMemoryfrom langchain.chains import ConversationChain################################################################## Tokenizer#################################################################model_name='mistralai/Mistral-7B-Instruct-v0.1'tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)tokenizer.pad_token = tokenizer.eos_tokentokenizer.padding_side = "right"################################################################## bitsandbytes parameters################################################################## Activate 4-bit precision base model loadinguse_4bit = True# Compute dtype for 4-bit base modelsbnb_4bit_compute_dtype = "float16"# Quantization type (fp4 or nf4)bnb_4bit_quant_type = "nf4"# Activate nested quantization for 4-bit base models (double 
quantization)use_nested_quant = False################################################################## Set up quantization config#################################################################compute_dtype = getattr(torch, bnb_4bit_compute_dtype)bnb_config = BitsAndBytesConfig( load_in_4bit=use_4bit, bnb_4bit_quant_type=bnb_4bit_quant_type, bnb_4bit_compute_dtype=compute_dtype, bnb_4bit_use_double_quant=use_nested_quant,)# Check GPU compatibility with bfloat16# if compute_dtype == torch.float16 and use_4bit:# major, _ = torch.cuda.get_device_capability()# if major >= 8:# print("=" * 80)# print("Your GPU supports bfloat16: accelerate training with bf16=True")# print("=" * 80)################################################################## Load pre-trained config#################################################################model = AutoModelForCausalLM.from_pretrained( model_name, quantization_config=bnb_config,)# LOAD VECTOR DBdb = Chroma(persist_directory="./",embedding_function=HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'))retriever = db.as_retriever()text_generation_pipeline = pipeline( model=model, tokenizer=tokenizer, task="text-generation", temperature=0.2, repetition_penalty=1.1, return_full_text=True, max_new_tokens=1000,)mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)template = """### [INST] Instruction: You are a sales assistant and your role is to advise clients taking into accounttheir requests. You have to respond in English and give 3 recommendations together with the links of the products. 
{context}### QUESTION:{question} [/INST][/INST]"""from langchain.chains import ConversationalRetrievalChain,RetrievalQAprompt = PromptTemplate(input_variables=["context", "question"], template=template)llm = mistral_llmmemory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)qa = RetrievalQA.from_chain_type(llm, retriever=retriever, memory=memory,chain_type_kwargs={'prompt': prompt})qa.memory.clear()question1= "I'm looking for a white whine with tropical flavours"result1['result']question2= "what is the price of these 3 wines?"result2 = qa({"query": question2})result2['result']As mentioned, result1['result'] works great, but when i follow up with the question2, it does not seem to remember the conversation..?
Thank you