I've been trialling this example to build a Python AI chatbot, which scrapes a website; stores the data and uses OpenAI.
Right now, I have scrape.py which gets the website stored in the .env file and saves this into Apify. This is working as expected and that data exists. Code below:
scrape.py
import osfrom apify_client import ApifyClientfrom dotenv import load_dotenvfrom langchain_community.document_loaders import ApifyDatasetLoaderfrom langchain_community.document_loaders.base import Documentfrom langchain_community.embeddings.openai import OpenAIEmbeddingsfrom langchain.text_splitter import RecursiveCharacterTextSplitterfrom langchain_community.vectorstores import Chroma# Load environment variables from a .env fileload_dotenv()if __name__ == '__main__': apify_client = ApifyClient(os.environ.get('APIFY_API_TOKEN')) website_url = os.environ.get('WEBSITE_URL') print(f'Extracting data from "{website_url}". Please wait...') actor_run_info = apify_client.actor('apify/website-content-crawler').call( run_input={'startUrls': [{'url': website_url}]} ) print('Saving data into the vector database. Please wait...') loader = ApifyDatasetLoader( dataset_id=actor_run_info['defaultDatasetId'], dataset_mapping_function=lambda item: Document( page_content=item['text'] or '', metadata={'source': item['url']} ), ) documents = loader.load() text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100) docs = text_splitter.split_documents(documents) embedding = OpenAIEmbeddings() vectordb = Chroma.from_documents( documents=docs, embedding=embedding, persist_directory='db2', ) vectordb.persist() print('All done!')
Next, we have chat.py which uses stream to open a chatbot, and ask questions about the website you have in your .env file. The current issue I have is the chatbot works, but when I ask a question such as 'what are the opening hours' - instead of using the stored data it looks to be trying to browse the internet, to which it says 'I'm sorry, but I don't have access to real-time information or the ability to browse the internet.'
I'm unsure as to why this is, and can't seem to find much more information online. Any help or guidance would be incredible!
chat.py
import osimport streamlit as stfrom dotenv import load_dotenvfrom langchain.callbacks.base import BaseCallbackHandlerfrom langchain.chains import ConversationalRetrievalChainfrom langchain.chat_models import ChatOpenAIfrom langchain.embeddings import OpenAIEmbeddingsfrom langchain.memory import ConversationBufferMemoryfrom langchain.memory.chat_message_histories import StreamlitChatMessageHistoryfrom langchain.vectorstores import Chromaload_dotenv()website_url = os.environ.get('WEBSITE_URL', 'a website')st.set_page_config(page_title=f'Chat with {website_url}')st.title('Learn about Alnwick gardens')@st.cache_resource(ttl='1h')def get_retriever(): embeddings = OpenAIEmbeddings() vectordb = Chroma(persist_directory='db', embedding_function=embeddings) retriever = vectordb.as_retriever(search_type='mmr') return retrieverclass StreamHandler(BaseCallbackHandler): def __init__(self, container: st.delta_generator.DeltaGenerator, initial_text: str = ''): self.container = container self.text = initial_text def on_llm_new_token(self, token: str, **kwargs) -> None: self.text += token self.container.markdown(self.text)retriever = get_retriever()msgs = StreamlitChatMessageHistory()memory = ConversationBufferMemory(memory_key='chat_history', chat_memory=msgs, return_messages=True)llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0, streaming=True)qa_chain = ConversationalRetrievalChain.from_llm( llm, retriever=retriever, memory=memory, verbose=False)if st.sidebar.button('Clear message history') or len(msgs.messages) == 0: msgs.clear() msgs.add_ai_message(f'Ask me anything about {website_url}!')avatars = {'human': 'user', 'ai': 'assistant'}for msg in msgs.messages: st.chat_message(avatars[msg.type]).write(msg.content)if user_query := st.chat_input(placeholder='Ask me anything!'): st.chat_message('user').write(user_query) with st.chat_message('assistant'): stream_handler = StreamHandler(st.empty()) response = qa_chain.run(user_query, callbacks=[stream_handler])