How can I translate only the text in an hwp file while keeping all images, layout, and formatting intact?
I used googletrans (the Google Translate client library) and olefile.
Given an hwp file as input, only the text should be translated into English while the document remains just as usable and its formatting is preserved, and the result must be written back out as an hwp file.
import os
import struct
import zlib
from glob import glob

import googletrans
import olefile

# Byte offset in the "FileHeader" stream whose lowest bit flags that the
# BodyText sections are stored compressed.
_COMPRESSION_FLAG_OFFSET = 36

# Record tag id whose payload is paragraph text (HWPTAG_PARA_TEXT in the
# HWP 5.0 record format).
HWPTAG_PARA_TEXT = 67

# A 12-bit size field equal to this value means "the real size follows in the
# next 4 bytes" (HWP 5.0 record header, extended-size form).
_EXTENDED_SIZE = 0xFFF

# Upper bound on the number of characters sent to the translator per request.
MAX_TRANSLATE_CHARS = 4000


def _iter_records(data):
    """Yield ``(tag_id, payload)`` for every record in a raw section buffer.

    Each record starts with a little-endian 32-bit header: bits 0-9 hold the
    tag id and bits 20-31 hold the payload size. Parsing stops quietly at a
    trailing fragment that is too short to contain a header.
    """
    offset = 0
    total = len(data)
    while offset + 4 <= total:
        (header,) = struct.unpack_from("<I", data, offset)
        offset += 4
        tag_id = header & 0x3FF
        length = (header >> 20) & 0xFFF
        if length == _EXTENDED_SIZE:
            # Large records store their true length in an extra DWORD; without
            # this the parser desynchronises on paragraphs >= 4095 bytes.
            if offset + 4 > total:
                break
            (length,) = struct.unpack_from("<I", data, offset)
            offset += 4
        yield tag_id, data[offset:offset + length]
        offset += length


def get_hwp_text(filename):
    """Return the paragraph text of an HWP document as a single string.

    Every paragraph is followed by a newline and every section by one extra
    newline. Only text is extracted; images and formatting are not represented
    in the result.

    Raises:
        Exception: if the OLE container lacks the "FileHeader" or
            "\\x05HwpSummaryInformation" stream ("Not Valid HWP.").
    """
    ole = olefile.OleFileIO(filename)
    try:
        entries = ole.listdir()
        if ["FileHeader"] not in entries or \
                ["\x05HwpSummaryInformation"] not in entries:
            raise Exception("Not Valid HWP.")

        header_data = ole.openstream("FileHeader").read()
        is_compressed = (header_data[_COMPRESSION_FLAG_OFFSET] & 1) == 1

        # Sections must be read in numeric order (Section10 after Section9).
        section_numbers = sorted(
            int(entry[1][len("Section"):])
            for entry in entries
            if entry[0] == "BodyText"
        )

        pieces = []
        for number in section_numbers:
            raw = ole.openstream(f"BodyText/Section{number}").read()
            # wbits=-15: raw deflate stream without a zlib header.
            data = zlib.decompress(raw, -15) if is_compressed else raw
            for tag_id, payload in _iter_records(data):
                if tag_id == HWPTAG_PARA_TEXT:
                    # Explicit little-endian: plain "utf-16" would sniff for a
                    # BOM and otherwise fall back to the machine's byte order.
                    pieces.append(payload.decode("utf-16-le"))
                    pieces.append("\n")
            pieces.append("\n")
        return "".join(pieces)
    finally:
        # Always release the file handle, including on the error path.
        ole.close()


# NOTE: an earlier attempt (write_hwp_text) tried to push the translated text
# back into the OLE container through olefile. That cannot work as written:
# olefile's OleFileIO.write_stream() requires the file to be opened with
# write_mode=True and can only overwrite a stream with data of exactly the
# same size, while translated text changes length. It also targeted the
# "PrvText" stream rather than the BodyText sections the text is read from
# (presumably PrvText is only a preview - TODO confirm). Producing a real
# translated .hwp therefore needs a tool that can rebuild the document.


def write_txt_file(output_path, text):
    """Write *text* to *output_path* as UTF-8, printing the final path.

    The content is plain text, so a ".hwp" extension on *output_path* is
    swapped for ".txt"; saving it under ".hwp" would produce a file that is
    not an HWP document.
    """
    root, ext = os.path.splitext(output_path)
    if ext.lower() == ".hwp":
        output_path = root + ".txt"
    print(output_path)
    # Explicit encoding: the locale default may be unable to represent every
    # character in the text.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)


def _chunk_text(text, limit):
    """Split *text* into pieces of at most *limit* characters.

    Pieces break on line boundaries where possible; a single line longer than
    *limit* is cut into fixed-size slices. Joining the pieces with "\\n"
    reproduces *text*, except that an over-long line gains a newline at each
    forced cut.
    """
    chunks = []
    current = []
    current_len = 0
    for line in text.split("\n"):
        while len(line) > limit:
            if current:
                chunks.append("\n".join(current))
                current, current_len = [], 0
            chunks.append(line[:limit])
            line = line[limit:]
        added = len(line) + (1 if current else 0)
        if current and current_len + added > limit:
            chunks.append("\n".join(current))
            current, current_len = [], 0
            added = len(line)
        current.append(line)
        current_len += added
    if current:
        chunks.append("\n".join(current))
    return chunks


def translate_hwp(input_path, output_path, target_lang='en'):
    """Extract the text of an HWP file, translate it and save it as text.

    Args:
        input_path: path of the source .hwp file.
        output_path: where to write the translation (see write_txt_file for
            how a ".hwp" extension is handled).
        target_lang: language code passed to the translator as the
            destination language.
    """
    text = get_hwp_text(input_path)
    translator = googletrans.Translator()

    translated = []
    # Translate the whole document in bounded requests instead of discarding
    # everything past the first MAX_TRANSLATE_CHARS characters.
    for chunk in _chunk_text(text, MAX_TRANSLATE_CHARS):
        if chunk.strip():
            chunk = translator.translate(
                chunk, dest=target_lang, src='auto'
            ).text
        # Whitespace-only chunks are kept as-is; there is nothing to translate.
        translated.append(chunk)

    write_txt_file(output_path, "\n".join(translated))


def main():
    """Translate every .hwp file matching the input pattern into output_dir."""
    input_pattern = './data/kr/*.hwp'
    output_dir = './data/test'
    os.makedirs(output_dir, exist_ok=True)

    for input_path in glob(input_pattern):
        print("#"*50)
        print(f"input hwp file is {input_path}")
        print()
        output_path = os.path.join(output_dir, os.path.basename(input_path))
        translate_hwp(input_path, output_path)


if __name__ == "__main__":
    main()