Quantcast
Channel: Active questions tagged python - Stack Overflow
Viewing all articles
Browse latest Browse all 23131

Python | How to translate only the text in an hwp file while keeping all images and formatting?

$
0
0

How to translate only the text in an hwp file while keeping all images and formatting?

I used Google Translate (via the googletrans package) and olefile.

Given an hwp file as input, only the text should be translated into English, keeping the ease of use and formatting intact, and the result should be written back out as an hwp file.

import os
import struct
import zlib
from glob import glob

import googletrans
import olefile

# Tag ID of the record whose payload is decoded as paragraph text
# (HWPTAG_PARA_TEXT in the HWP 5.0 specification).
_PARA_TEXT_TAG = 67

# A 12-bit record size of 0xFFF means the real size follows the header
# as an extra little-endian DWORD.
_EXTENDED_SIZE_MARKER = 0xFFF

# Each record starts with one little-endian 32-bit header word.
_DWORD = struct.Struct("<I")

# Only this many characters are sent to the translator; presumably this
# keeps the request under the service's size limit -- TODO confirm.
_MAX_TRANSLATE_CHARS = 4000


def _iter_records(data):
    """Yield ``(tag_id, payload)`` for every record in a section stream.

    ``data`` is the (already decompressed) bytes of one BodyText section.
    The header word holds the tag ID in bits 0-9 and the payload size in
    bits 20-31. Parsing stops quietly at a truncated trailing header
    instead of raising ``struct.error``.
    """
    offset = 0
    total = len(data)
    while offset + _DWORD.size <= total:
        header_word = _DWORD.unpack_from(data, offset)[0]
        offset += _DWORD.size
        tag_id = header_word & 0x3FF
        length = (header_word >> 20) & 0xFFF
        if length == _EXTENDED_SIZE_MARKER:
            # Large record: the true length is stored in the next DWORD.
            if offset + _DWORD.size > total:
                break
            length = _DWORD.unpack_from(data, offset)[0]
            offset += _DWORD.size
        yield tag_id, data[offset:offset + length]
        offset += length


def get_hwp_text(filename):
    """Return the paragraph text of an HWP file as a single string.

    Every paragraph is followed by a newline and every section by one
    additional newline.

    Args:
        filename: Path (or anything ``olefile.OleFileIO`` accepts) of the
            HWP document.

    Returns:
        The concatenated text of all ``BodyText/Section<N>`` streams, in
        ascending section order.

    Raises:
        ValueError: If the container lacks the ``FileHeader`` or
            ``\\x05HwpSummaryInformation`` stream.
    """
    ole = olefile.OleFileIO(filename)
    try:
        dirs = ole.listdir()
        if (["FileHeader"] not in dirs
                or ["\x05HwpSummaryInformation"] not in dirs):
            raise ValueError("Not Valid HWP.")

        # Bit 0 of the byte at offset 36 of FileHeader flags compression.
        header_data = ole.openstream("FileHeader").read()
        is_compressed = (header_data[36] & 1) == 1

        prefix = "Section"
        section_numbers = sorted(
            int(entry[1][len(prefix):])
            for entry in dirs
            if len(entry) == 2
            and entry[0] == "BodyText"
            and entry[1].startswith(prefix)
        )

        pieces = []
        for number in section_numbers:
            raw = ole.openstream("BodyText/Section" + str(number)).read()
            # Sections are stored as raw deflate (no zlib header): wbits=-15.
            section = zlib.decompress(raw, -15) if is_compressed else raw
            for tag_id, payload in _iter_records(section):
                if tag_id == _PARA_TEXT_TAG:
                    # Decode explicitly as little-endian so the result does
                    # not depend on the platform's native byte order.
                    pieces.append(payload.decode("utf-16-le"))
                    pieces.append("\n")
            pieces.append("\n")
        return "".join(pieces)
    finally:
        # Always release the underlying file handle, even on error.
        ole.close()


# NOTE: writing the translated text back into the HWP container is not
# implemented here; the result is saved as plain text only (see
# write_txt_file).


def write_txt_file(output_path, text):
    """Write ``text`` to ``output_path`` as UTF-8 and print the path.

    NOTE(review): the caller passes a path ending in ``.hwp``, but the
    content written here is plain text, not an HWP document.
    """
    print(output_path)
    # Explicit encoding: translated text must not depend on the locale's
    # default codec, which may be unable to encode it.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)


def translate_hwp(input_path, output_path, target_lang='en'):
    """Extract the text of one HWP file, translate it, and save it.

    Args:
        input_path: Path of the source HWP file.
        output_path: Path the translated plain text is written to.
        target_lang: Language code passed to the translator as ``dest``.
    """
    text = get_hwp_text(input_path)[:_MAX_TRANSLATE_CHARS]
    if not text.strip():
        # Nothing to translate; skip the network round-trip.
        write_txt_file(output_path, text)
        return
    translator = googletrans.Translator()
    translated_text = translator.translate(
        text, dest=target_lang, src='auto'
    ).text
    write_txt_file(output_path, translated_text)


def main():
    """Translate every ``./data/kr/*.hwp`` file into ``./data/test``."""
    input_pattern = './data/kr/*.hwp'
    output_dir = './data/test'
    os.makedirs(output_dir, exist_ok=True)
    for input_path in sorted(glob(input_pattern)):
        print("#"*50)
        print(f"input hwp file is {input_path}")
        print()
        basename = os.path.basename(input_path)
        output_path = os.path.join(output_dir, basename)
        translate_hwp(input_path, output_path)


if __name__ == "__main__":
    main()

Viewing all articles
Browse latest Browse all 23131

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>