import osfrom langchain.llms import OpenAIimport bs4import langchainfrom langchain import hubfrom langchain.document_loaders import UnstructuredFileLoaderfrom langchain.embeddings import OpenAIEmbeddingsfrom langchain.text_splitter import RecursiveCharacterTextSplitterfrom langchain.vectorstores import Chromaos.environ["OPENAI_API_KEY"] = "KEY"loader = UnstructuredFileLoader('path_to_file')docs = loader.load()text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200, add_start_index=True)all_splits = text_splitter.split_documents(docs)vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})retrieved_docs = retriever.get_relevant_documents("What is X?")This returns:
[Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}), Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}), Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}), Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}), Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932}), Document(page_content="...", metadata={'source': 'path_to_text', 'start_index': 16932})]Which is all seemingly the same document.
When I first ran this code in Google Colab/Jupyter Notebook, it returned different documents...as I ran it more, it started returning the same documents. Makes me feel like this is a database issue, where the same entry is being inserted into the database with each run.
How do I return 6 different unique documents?