I am using the code below to convert .docx files to PDF, but I don't think it is doing the job correctly. I want it to convert as many files as possible at the same time when I put them in the directory.
import osimport timeimport pythoncomfrom win32com import clientfrom docx.shared import Ptfrom watchdog.observers import Observerfrom watchdog.events import FileSystemEventHandlerfrom concurrent.futures import ThreadPoolExecutorimport shutilfrom concurrent.futures import ProcessPoolExecutorimport concurrent.futuresbaseAd = r"C:\inetpub\wwwroot\utkuploads"fileList = []max_processes = os.cpu_count() or 1max_threads = os.cpu_count() or 1def createText(filename, filedetail): with open(r"C:\inetpub\wwwroot\utkuploads\{filename}.txt".format(filename=filename), 'w') as f: f.write(f'{filedetail}')def doc2pdf(doc_name, pdf_name, font_size=8): pythoncom.CoInitialize() word = client.DispatchEx("Word.Application") if os.path.exists(pdf_name): os.remove(pdf_name) worddoc = word.Documents.Open(doc_name, ReadOnly=1) worddoc.Content.Font.Size = font_size try: worddoc.SaveAs(pdf_name, FileFormat=17) except Exception as e: createText('wordToPdfException', f"{e}") worddoc.Close() # Quit the Word application word.Quit() pythoncom.CoUninitialize() return pdf_nameclass DocFileHandler(FileSystemEventHandler): def is_temporary_file(event, filename): return filename.startswith("~$") def on_created(self, event): if not (self.is_temporary_file(os.path.basename(event.src_path)) or event.src_path.endswith('.tmp') or event.is_directory or os.path.basename(event.src_path).startswith("~$")): fileList.append(event)def outer_is_temporary_file(filename): return filename.startswith("~$")def createFolder(baseAd, folderName): path = os.path.join(baseAd, folderName) isExist = os.path.exists(path) try: if not isExist: os.makedirs(path) return path except: return pathdef createFolderAtt(folderName): isExist = os.path.exists(folderName) try: if not isExist: os.makedirs(folderName) return folderName except: return folderNamedef mainConverter(event): # The first [0] is root directory utkuploads the second is the file name with extension. 
currentFileName = os.path.split(event.src_path) currentFileNameSplitted = os.path.split(event.src_path)[-1] try: ## This part needs to work for the files that needs to be converted to PDF. ## It catches DOCX files and takes their location by doc_path and creates a fake pdf_path directory if event.event_type == 'created' and event.src_path.lower().endswith('.docx') and '@' not in currentFileNameSplitted and not outer_is_temporary_file( event.src_path): print(f"{currentFileName} STARTED") doc_path = event.src_path pdf_path = os.path.splitext(doc_path)[0] +'.pdf' # print(f'Doc path: {doc_path}, \nPdf path: {pdf_path}') # If '_' in doc_path if '_' in doc_path: print(f'New Template has been detected: {doc_path}') return # If file is not temporary, not _ (template), not attachment, not TEMPLATE-REPORT elif '~$' not in doc_path and '_' not in doc_path and '@' not in doc_path and 'TEMPLATE-REPORT' not in doc_path: # print(f"File will be converted here: {doc_path}") try: if '-GENERATED-REPORT' in doc_path: # Here pdf convertion happens. doc2pdf(doc_path, pdf_path) # Create subFolder based on PDF file. createFolderPath = os.path.split(pdf_path)[-1].split(".")[0] createFolderPath = createFolderPath.replace('-GENERATED-REPORT', '') try: newFolderPath = createFolder(baseAd, createFolderPath) except Exception as error: createText('createFolderGeneratedReport', f'{error}') # print(f"New folder has been created: {newFolderPath}") pdfFileName = os.path.split(pdf_path)[-1] src_pdf = pdf_path dest_pathPdf = os.path.join(newFolderPath, pdfFileName) shutil.move(src_pdf, dest_pathPdf) # print(f"File has been moved to its destination. src: {src_pdf}, destination: {dest_pathPdf} ") # print('Doc path', doc_path) wordFileName = os.path.split(doc_path)[-1] wordPdf = wordFileName dest_pathWord = os.path.join(newFolderPath, wordPdf) shutil.move(doc_path, dest_pathWord) # print( f"Generated Rapor File has been moved to its destination. 
src: {doc_path}, destination: {dest_pathWord} ") elif '-IMZALIRAPOR' in doc_path: # Here pdf convertion happens. doc2pdf(doc_path, pdf_path) # Create subFolder based on PDF file. createFolderPath = os.path.split(pdf_path)[-1].split(".")[0] createFolderPath = createFolderPath.replace('-IMZALIRAPOR', '') try: newFolderPath = createFolder(baseAd, createFolderPath) except Exception as error: createText('CreateFolderImzaliRapor', f'{error}') # print(f"New folder has been created: {newFolderPath}") pdfFileName = os.path.split(pdf_path)[-1] src_pdf = pdf_path dest_pathPdf = os.path.join(newFolderPath, pdfFileName) shutil.move(src_pdf, dest_pathPdf) # print(f"File has been moved to its destination. src: {src_pdf}, destination: {dest_pathPdf} ") # print('Doc path', doc_path) wordFileName = os.path.split(doc_path)[-1] wordPdf = wordFileName dest_pathWord = os.path.join(newFolderPath, wordPdf) shutil.move(doc_path, dest_pathWord) # print(f"Imzali Report File has been moved to its destination. src: {doc_path}, destination: {dest_pathWord} ") elif 'GENERATED-REPORT' not in doc_path and '-IMZALIRAPOR' not in doc_path and '@' not in doc_path: # Here pdf convertion happens. doc2pdf(doc_path, pdf_path) # Create subFolder based on PDF file. createFolderPath = os.path.split(pdf_path)[-1].split(".")[0] try: newFolderPath = createFolder(baseAd, createFolderPath) except Exception as error: createText('buAnaModel', f'{error}') # print(f"New folder has been created: {newFolderPath}") pdfFileName = os.path.split(pdf_path)[-1] src_pdf = pdf_path dest_pathPdf = os.path.join(newFolderPath, pdfFileName) shutil.move(src_pdf, dest_pathPdf) # print(f"File has been moved to its destination. src: {src_pdf}, destination: {dest_pathPdf} ") # print('Doc path', doc_path) wordFileName = os.path.split(doc_path)[-1] wordPdf = wordFileName dest_pathWord = os.path.join(newFolderPath, wordPdf) shutil.move(doc_path, dest_pathWord) # print(f"File has been moved to its destination. 
src: {doc_path}, destination: {dest_pathWord} ") except Exception as e: createText('exceptionHasOccured...', f'{e}') print(f"{currentFileName} FINISHED") elif event.event_type == 'created' and '@' in currentFileNameSplitted and not outer_is_temporary_file( event.src_path): print(f"{currentFileName} ATTACHMENT STARTED") doc_path = event.src_path folderPath = currentFileNameSplitted.split("@")[1].split(".")[0] try: baseFolderPath = os.path.split(doc_path)[:-1][0] # print(f"Attachments detected: {doc_path}, {currentFileNameSplitted}, {baseFolderPath}") dest_path = os.path.join(baseFolderPath, folderPath, currentFileNameSplitted) try: shutil.move(doc_path, dest_path) print(f"{currentFileName} ATTACHMENT MOVED") except: try: createFolderAtt(os.path.join(baseFolderPath, folderPath)) shutil.move(doc_path, dest_path) print(f"{currentFileName} ATTACHMENT MOVED") except Exception as e: createText('InnerAttachmentError', f'{e}') except Exception as e: createText('outerAttachmentErrorOccured', f'{e}') except Exception as e: createText('outerAllExceptionasOccured', f'{e}')def main(): with ProcessPoolExecutor(max_processes) as executor: #with concurrent.futures.ProcessPoolExecutor(max_processes) as executor: while True: if fileList: file_to_process = fileList.pop() print('File has been sent', file_to_process) executor.submit(mainConverter, file_to_process)"""with ThreadPoolExecutor(max_threads) as executor: while True: if fileList: file_to_process = fileList.pop() print('File has been sent', file_to_process) executor.submit(mainConverter, file_to_process)"""if __name__ == '__main__': directory_to_watch = r"C:\inetpub\wwwroot\utkuploads" event_handler = DocFileHandler() observer = Observer() observer.schedule(event_handler, path=directory_to_watch, recursive=False) observer.start() try: main() except KeyboardInterrupt: observer.stop() observer.join()I have tried using three different ones the best performing one is ProcessPoolExecutor, but it doesn't convert files at the same 
time. It doesn't open, say, 5 Word instances and convert the files asynchronously.
with ProcessPoolExecutor(max_processes) as executor:
with concurrent.futures.ProcessPoolExecutor(max_processes) as executor:
with ThreadPoolExecutor(max_threads) as executor:
What am I doing wrong?
@Booboo Reply to Booboo: I did the same. I ran the code and copied a few files to the folder, but they got stuck here.
Hit Enter to terminate: <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693-GENERATED-REPORT.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055733693-GENERATED-REPORT.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693-GENERATED-REPORT.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055677609.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609-GENERATED-REPORT.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055677609-GENERATED-REPORT.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055677609-GENERATED-REPORT.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055683658-GENERATED-REPORT.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055683658-GENERATED-REPORT.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055683658-GENERATED-REPORT.docx', is_directory=False>
<FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693.docx', is_directory=False> C:\inetpub\wwwroot\utkuploads\4321-1705055733693.docx
File has been sent <FileCreatedEvent: event_type=created, src_path='C:\\inetpub\\wwwroot\\utkuploads\\4321-1705055733693.docx', is_directory=False>

Then I hit Enter and these conversions happened:

('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658-GENERATED-REPORT.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693-GENERATED-REPORT.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609-GENERATED-REPORT.docx') STARTED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658-GENERATED-REPORT.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055683658.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055733693-GENERATED-REPORT.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609.docx') FINISHED
('C:\\inetpub\\wwwroot\\utkuploads', '4321-1705055677609-GENERATED-REPORT.docx') FINISHED