Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 23131

Splitted PDF Files are being duplicated when Splitted using Text - Python Script

$
0
0

I am trying to split 100 of PDFs with python by specific keyword.If a page in Python contains that keyword, split that page into a new PDF.The Problem i am facing is, the files are being duplicated. I have tried everything to stop the duplication process but it won't change.

import osimport fitz  # PyMuPDFdef split_pdf_by_text(pdf_path, keyword, output_folder):    # Check if the provided path is a valid file    if not os.path.isfile(pdf_path):        print(f"Error: '{pdf_path}' is not a valid file.")        return    # Create output folder if it doesn't exist    if not os.path.exists(output_folder):        os.makedirs(output_folder)    print(f"Processing PDF: {pdf_path}")    # Open the PDF file    pdf_document = fitz.open(pdf_path)    # Initialize a set to keep track of processed pages    processed_pages = set()    # Iterate through each page    for page_number in range(len(pdf_document)):        # Skip the page if it's already processed        if page_number in processed_pages:            continue        # Get the page        page = pdf_document.load_page(page_number)        # Extract text from the page        text = page.get_text()        # Check if the keyword exists in the page text        if keyword in text:            # Construct the output file path            output_file_name = f"{os.path.splitext(os.path.basename(pdf_path))[0]}_page_{page_number + 1}.pdf"            output_path = os.path.join(output_folder, output_file_name)            # Create a new PDF document            new_pdf = fitz.open()            new_pdf.insert_pdf(pdf_document, from_page=page_number, to_page=page_number)  # Insert the current page into the new PDF            new_pdf.save(output_path)  # Save the new PDF            print(f"Page {page_number + 1} saved to: {output_path}")            # Close the new PDF            new_pdf.close()            # Add the page number to the set of processed pages            processed_pages.add(page_number)    # Close the original PDF    pdf_document.close()# Define the function to process all PDF files in a directorydef process_all_pdfs(input_folder, keyword, output_folder):    # Iterate through each file in the input folder    for root, _, files in os.walk(input_folder):        for file in files:            if file.endswith(".pdf"):                # Get the full path of the PDF file                pdf_path = os.path.join(root, file)                # Process the PDF file                split_pdf_by_text(pdf_path, keyword, output_folder)

If a page was processed, I tried to skip it and move along to the next page, but it's not working out for me.


Viewing all articles
Browse latest Browse all 23131

Trending Articles