Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 23160

Convert Files Async by using ProcessPoolExecutor

$
0
0

I have this code that I use to convert .docx files to PDF, but I feel it is not doing the job correctly. I want it to convert as many files as possible at the same time when I drop them into the watched directory.

# Watches a directory for newly created files. Word documents are converted
# to PDF through Word COM automation and filed, together with the PDF, into a
# per-report sub-folder. Attachments (file names containing '@') are only moved.

import concurrent.futures
import os
import shutil
import time
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor

import pythoncom
from docx.shared import Pt
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from win32com import client

baseAd = r"C:\inetpub\wwwroot\utkuploads"
# Events queued by the watchdog thread and consumed by main().
fileList = []
max_processes = os.cpu_count() or 1
# Only needed if main() is switched to ThreadPoolExecutor(max_threads).
max_threads = os.cpu_count() or 1

# Word automation constants (WdSaveFormat / WdSaveOptions).
_WD_FORMAT_PDF = 17
_WD_DO_NOT_SAVE_CHANGES = 0

# How long main() sleeps when the queue is empty, so it does not spin a core.
_IDLE_POLL_SECONDS = 0.1


def createText(filename, filedetail):
    """Write *filedetail* to ``<baseAd>/<filename>.txt``, replacing any old file.

    This is the script's error-reporting channel: every handler dumps the
    exception text into a named file in the upload directory.
    """
    # UTF-8 so a message with non-ASCII characters cannot raise a
    # UnicodeEncodeError inside the except handlers that call this helper.
    with open(os.path.join(baseAd, f"{filename}.txt"), 'w', encoding='utf-8') as f:
        f.write(f'{filedetail}')


def doc2pdf(doc_name, pdf_name, font_size=8):
    """Convert the Word document *doc_name* to the PDF file *pdf_name*.

    A private Word instance is started for the call, the whole document is set
    to *font_size* and exported as PDF. An existing *pdf_name* is removed
    first. A failing export is reported through ``createText`` and does not
    raise; failures while opening or re-formatting the document propagate to
    the caller.

    Returns *pdf_name*.
    """
    pythoncom.CoInitialize()
    try:
        word = client.DispatchEx("Word.Application")
        try:
            if os.path.exists(pdf_name):
                os.remove(pdf_name)
            worddoc = word.Documents.Open(doc_name, ReadOnly=1)
            try:
                worddoc.Content.Font.Size = font_size
                try:
                    worddoc.SaveAs(pdf_name, FileFormat=_WD_FORMAT_PDF)
                except Exception as e:
                    createText('wordToPdfException', f"{e}")
            finally:
                # The font change made the read-only document dirty; discard
                # it explicitly instead of relying on Word's default save
                # handling.
                worddoc.Close(SaveChanges=_WD_DO_NOT_SAVE_CHANGES)
        finally:
            # Always quit, otherwise a failed conversion leaves a hidden
            # WINWORD.EXE process behind.
            word.Quit()
    finally:
        pythoncom.CoUninitialize()
    return pdf_name


class DocFileHandler(FileSystemEventHandler):
    """Queues "created" events for files that are not temporary."""

    def is_temporary_file(self, filename):
        """Return True if *filename* (no directory part) starts with ``~$``."""
        return filename.startswith("~$")

    def on_created(self, event):
        """Append *event* to ``fileList`` unless it is a directory or temp file."""
        if event.is_directory:
            return
        name = os.path.basename(event.src_path)
        if self.is_temporary_file(name) or event.src_path.endswith('.tmp'):
            return
        fileList.append(event)


def outer_is_temporary_file(filename):
    """Return True if the file name part of *filename* starts with ``~$``.

    Accepts a bare file name or a full path. Only the last path component is
    checked, because an absolute path never starts with ``~$``.
    """
    return os.path.basename(filename).startswith("~$")


def createFolder(baseAd, folderName):
    """Create ``baseAd/folderName`` if it is missing and return that path.

    Best effort: the path is returned even when the directory could not be
    created; the caller's subsequent move reports the failure.
    """
    path = os.path.join(baseAd, folderName)
    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        pass
    return path


def createFolderAtt(folderName):
    """Create the directory *folderName* if it is missing and return it.

    Best effort, like ``createFolder``.
    """
    try:
        os.makedirs(folderName, exist_ok=True)
    except OSError:
        pass
    return folderName


def _convert_and_file(doc_path, pdf_path, suffix, folder_log_name):
    """Convert *doc_path* to PDF and move both files into their report folder.

    The folder name is the file name up to its first dot with *suffix*
    removed (pass ``''`` to keep it unchanged). A failure while creating the
    folder is written to the *folder_log_name* report file and the move is
    skipped.
    """
    doc2pdf(doc_path, pdf_path)

    pdf_name = os.path.basename(pdf_path)
    folder_name = pdf_name.split(".")[0]
    if suffix:
        folder_name = folder_name.replace(suffix, '')

    try:
        target_dir = createFolder(baseAd, folder_name)
    except Exception as error:
        createText(folder_log_name, f'{error}')
        return

    shutil.move(pdf_path, os.path.join(target_dir, pdf_name))
    shutil.move(doc_path, os.path.join(target_dir, os.path.basename(doc_path)))


def _handle_report(doc_path, path_parts):
    """Classify a new .docx by the markers in its file name and convert it.

    *path_parts* is ``os.path.split(doc_path)``; it is printed as-is in the
    progress messages.
    """
    # Markers are looked up in the file name only, so a '_' or '@' somewhere
    # in the directory part cannot misclassify every file.
    file_name = path_parts[-1]
    print(f"{path_parts} STARTED")
    pdf_path = os.path.splitext(doc_path)[0] + '.pdf'

    if '_' in file_name:
        # Templates are left untouched.
        print(f'New Template has been detected: {doc_path}')
        return

    if 'TEMPLATE-REPORT' not in file_name:
        try:
            if '-GENERATED-REPORT' in file_name:
                _convert_and_file(doc_path, pdf_path, '-GENERATED-REPORT',
                                  'createFolderGeneratedReport')
            elif '-IMZALIRAPOR' in file_name:
                _convert_and_file(doc_path, pdf_path, '-IMZALIRAPOR',
                                  'CreateFolderImzaliRapor')
            elif 'GENERATED-REPORT' not in file_name:
                _convert_and_file(doc_path, pdf_path, '', 'buAnaModel')
        except Exception as e:
            createText('exceptionHasOccured...', f'{e}')

    print(f"{path_parts} FINISHED")


def _move_attachment(doc_path, path_parts):
    """Move an attachment into the sub-folder named after its '@' part.

    For ``<anything>@<folder>.<ext>`` the file is moved to
    ``<directory of doc_path>/<folder>/``, which is created when missing.
    """
    file_name = path_parts[-1]
    print(f"{path_parts} ATTACHMENT STARTED")
    folder_name = file_name.split("@")[1].split(".")[0]
    try:
        target_dir = os.path.join(os.path.dirname(doc_path), folder_name)
        dest_path = os.path.join(target_dir, file_name)
        try:
            # Create the folder up front instead of letting a first move fail.
            createFolderAtt(target_dir)
            shutil.move(doc_path, dest_path)
            print(f"{path_parts} ATTACHMENT MOVED")
        except Exception as e:
            createText('InnerAttachmentError', f'{e}')
    except Exception as e:
        createText('outerAttachmentErrorOccured', f'{e}')


def mainConverter(event):
    """Process one watchdog event; intended to run in a worker process.

    Only "created" events for non-temporary files are handled. A file whose
    name contains '@' is treated as an attachment and moved; any other
    ``.docx`` file is converted to PDF and filed. Nothing is raised: every
    failure ends up in a report file written by ``createText``.
    """
    # (directory, file name) of the file that triggered the event.
    path_parts = os.path.split(event.src_path)
    file_name = path_parts[-1]
    try:
        if event.event_type != 'created' or outer_is_temporary_file(file_name):
            return
        if '@' in file_name:
            _move_attachment(event.src_path, path_parts)
        elif event.src_path.lower().endswith('.docx'):
            _handle_report(event.src_path, path_parts)
    except Exception as e:
        createText('outerAllExceptionasOccured', f'{e}')


def _report_worker_failure(future):
    """Report a worker failure that never reached mainConverter's handlers."""
    if future.cancelled():
        return
    error = future.exception()
    if error is not None:
        createText('workerProcessFailed', f'{error!r}')


def main():
    """Dispatch queued events to a process pool until interrupted.

    ``ThreadPoolExecutor(max_threads)`` can be swapped in for the process
    pool; ``doc2pdf`` initialises COM per call, so either executor works.
    """
    with ProcessPoolExecutor(max_processes) as executor:
        while True:
            if not fileList:
                # Sleep while idle; a bare spin loop pins one CPU core.
                time.sleep(_IDLE_POLL_SECONDS)
                continue
            # Oldest first, so early files are not starved under load.
            file_to_process = fileList.pop(0)
            print('File has been sent', file_to_process)
            future = executor.submit(mainConverter, file_to_process)
            future.add_done_callback(_report_worker_failure)


if __name__ == '__main__':
    directory_to_watch = baseAd
    event_handler = DocFileHandler()
    observer = Observer()
    observer.schedule(event_handler, path=directory_to_watch, recursive=False)
    observer.start()
    try:
        main()
    except KeyboardInterrupt:
        pass
    finally:
        # Stop the watcher however main() ended, not only on Ctrl+C.
        observer.stop()
        observer.join()

I have tried three different executors; the best-performing one is ProcessPoolExecutor, but it still doesn't convert files at the same time. It doesn't open, say, five Word instances and convert the files concurrently.

with ProcessPoolExecutor(max_processes) as executor:
with concurrent.futures.ProcessPoolExecutor(max_processes) as executor:
with ThreadPoolExecutor(max_threads) as executor:

What am I doing wrong?

@Booboo Reply to Booboo: I have done the same. I ran the code and copied a few files to the folder, but they got stuck here.

Hit Enter to terminate: <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693-GENERATED-REPORT.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055733693-GENERATED-REPORT.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693-GENERATED-REPORT.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055677609.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609-GENERATED-REPORT.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055677609-GENERATED-REPORT.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609-GENERATED-REPORT.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055683658-GENERATED-REPORT.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055683658-GENERATED-REPORT.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055683658-GENERATED-REPORT.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055733693.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693.docx', is_directory=False>

Then I hit Enter and these conversions happened:

('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658-GENERATED-REPORT.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693-GENERATED-REPORT.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609-GENERATED-REPORT.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658-GENERATED-REPORT.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693-GENERATED-REPORT.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609-GENERATED-REPORT.docx') FINISHED

Viewing all articles
Browse latest Browse all 23160

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>