I am trying to split hundreds of PDFs with Python by a specific keyword. If a page in a PDF contains that keyword, that page should be split out into a new PDF. The problem I am facing is that the output files are being duplicated. I have tried everything to stop the duplication, but nothing changes.
import os

import fitz  # PyMuPDF


def split_pdf_by_text(pdf_path, keyword, output_folder):
    """Save every page of ``pdf_path`` whose text contains ``keyword`` as its own PDF.

    Each matching page is written to ``output_folder`` as
    ``<source base name>_page_<1-based page number>.pdf``. The output folder is
    created if it does not exist. If ``pdf_path`` is not an existing file, an
    error message is printed and nothing else happens.

    Args:
        pdf_path: Path of the PDF to scan.
        keyword: Text to look for; matched with a plain, case-sensitive
            substring test against the page's extracted text.
        output_folder: Directory that receives the single-page PDFs.

    Returns:
        None.
    """
    if not os.path.isfile(pdf_path):
        print(f"Error: '{pdf_path}' is not a valid file.")
        return

    os.makedirs(output_folder, exist_ok=True)

    print(f"Processing PDF: {pdf_path}")

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # The context managers close both documents even when text extraction or
    # saving raises, so no file handles leak across hundreds of PDFs.
    with fitz.open(pdf_path) as pdf_document:
        # Every page index is visited exactly once here, so no bookkeeping of
        # "already processed" pages is needed.
        for page_number, page in enumerate(pdf_document):
            if keyword not in page.get_text():
                continue

            output_file_name = f"{base_name}_page_{page_number + 1}.pdf"
            output_path = os.path.join(output_folder, output_file_name)

            # Copy just this one page into a fresh, empty document.
            with fitz.open() as new_pdf:
                new_pdf.insert_pdf(
                    pdf_document,
                    from_page=page_number,
                    to_page=page_number,
                )
                new_pdf.save(output_path)

            print(f"Page {page_number + 1} saved to: {output_path}")


def _normalized_dir(path):
    """Return ``path`` in a canonical form suitable for comparing directories."""
    return os.path.normcase(os.path.realpath(path))


def process_all_pdfs(input_folder, keyword, output_folder):
    """Run ``split_pdf_by_text`` on every PDF found under ``input_folder``.

    The tree below ``input_folder`` is walked recursively. When
    ``output_folder`` is a subdirectory of that tree it is excluded from the
    walk: otherwise the single-page PDFs written earlier (which still contain
    the keyword) would be picked up as inputs and split again, producing
    duplicates such as ``name_page_3_page_1.pdf``.

    The list of input files is collected completely before any output is
    written, so files created during this run are never treated as inputs.

    Note: if ``output_folder`` is the same directory as ``input_folder`` it
    cannot be excluded, and a later run will see the earlier outputs as
    inputs. Use a separate output directory to avoid that.

    Args:
        input_folder: Root directory to search for ``.pdf`` files
            (extension matched case-insensitively).
        keyword: Text passed through to ``split_pdf_by_text``.
        output_folder: Directory that receives the single-page PDFs.

    Returns:
        None.
    """
    output_dir = _normalized_dir(output_folder)
    output_is_input_root = output_dir == _normalized_dir(input_folder)

    pdf_paths = []
    for root, dirs, files in os.walk(input_folder):
        if not output_is_input_root:
            # Editing ``dirs`` in place tells os.walk not to descend into the
            # output directory.
            dirs[:] = [
                name
                for name in dirs
                if _normalized_dir(os.path.join(root, name)) != output_dir
            ]

        pdf_paths.extend(
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(".pdf")
        )

    for pdf_path in pdf_paths:
        split_pdf_by_text(pdf_path, keyword, output_folder)


# Original note from the author, kept for context: "If a page was processed, I
# tried to skip it and move along to the next page, but it's not working out
# for me."
# Skipping pages inside one document cannot prevent duplicates, because each
# page is only ever visited once per document. The likely cause of duplicated
# files is that previously written output PDFs were found by the directory
# walk and split again; process_all_pdfs above now excludes the output folder.