Quantcast
Viewing all articles
Browse latest Browse all 14069

Remove the garbage words from the pdf

I am extracting text from a PDF using Python and libraries such as fitz, pdfreader and so on. However, my PDF contains some schematics, and the words inside them are not needed.

Here is an example.

When I extract the text, the words inside the schematics are included as well, but I do not want those words to appear, because even if the image itself can be extracted, the text inside the image is not meaningful.

I could not come up with a strategy to delete these useless words from the pdf.

import fitz
from io import BytesIO


class DeleteGarbage(object):
    """Redact text that sits on vector drawings which are not part of a table.

    The drawings of a page are walked item by item. Every line item, and
    every rectangle item smaller than ``page area / max_table_area``, that
    is not enclosed by one of the page's known table boxes is covered by a
    white redaction annotation (grown by ``margin`` on each side). The
    redactions are then applied to the page.
    """

    # Only the page at this 0-based enumerate index is processed; all other
    # pages are skipped untouched.
    TARGET_PAGE_INDEX = 2

    # Class-level fallback so ``check_if_not_table`` also works on instances
    # that were created without running ``__init__``.
    margin = 10

    def __init__(self, max_table_area=1.5, margin=10):
        """Store the filter settings.

        Args:
            max_table_area: Divisor applied to the page area; rectangles
                whose area is not below ``page area / max_table_area`` are
                never redacted.
            margin: Padding, in page coordinate units, added around every
                redacted region and used as tolerance when testing whether
                a region lies inside a table box.
        """
        self.max_table_area = max_table_area
        self.margin = margin

    def process(self, context):
        """Add and apply redactions for non-table drawings on the target page.

        Args:
            context: Mapping that provides the pages under ``context["fitz"]``
                and, per page, the table boxes under
                ``context['content']['pages'][i]['tables']``.

        Returns:
            The same ``context`` object; pages are modified in place.
        """
        white = (1, 1, 1)
        for page_number, page in enumerate(context["fitz"]):
            if page_number != self.TARGET_PAGE_INDEX:
                continue
            area_of_page = page.rect.width * page.rect.height
            for path in page.get_drawings():  # extract existing drawings
                for item in path["items"]:
                    rect = self._candidate_rect(item, area_of_page)
                    if rect is None:
                        continue
                    # NOTE(review): enumerate() is 0-based, so this reads the
                    # tables entry one position before the current page --
                    # confirm against whatever fills context['content'].
                    tables = context['content']['pages'][page_number - 1]['tables']
                    if not self.check_if_not_table(rect, page_number, tables):
                        continue
                    page.add_redact_annot(
                        self._padded(rect),
                        "",
                        align=fitz.TEXT_ALIGN_CENTER,
                        fill=white,
                        text_color=white,
                    )
            page.apply_redactions()
        return context

    def _candidate_rect(self, item, area_of_page):
        """Return the bounds of a drawing item to consider, or None to skip it."""
        kind = item[0]
        if kind == "l":  # line: item[1] and item[2] are its two end points
            start, end = item[1], item[2]
            # Order the coordinates so the box is valid whichever way the
            # line was drawn; otherwise the padding would shrink the box and
            # the table test would compare the wrong edges.
            return [
                min(start[0], end[0]),
                min(start[1], end[1]),
                max(start[0], end[0]),
                max(start[1], end[1]),
            ]
        if kind == "re":  # rectangle: item[1] is the rectangle itself
            rect = item[1]
            if rect.get_area() < area_of_page / self.max_table_area:
                return rect
        return None

    def _padded(self, rect):
        """Return ``rect`` grown by ``self.margin`` on every side, as a list."""
        margin = self.margin
        return [rect[0] - margin, rect[1] - margin, rect[2] + margin, rect[3] + margin]

    def check_if_not_table(self, rect, page_number, tables):
        """Tell whether ``rect`` lies outside every known table box.

        Args:
            rect: Sequence ``[x0, y0, x1, y1]`` with ``x0 <= x1``, ``y0 <= y1``.
            page_number: Unused; kept so existing callers keep working.
            tables: Mapping whose ``'coordination'`` entry is an iterable of
                table boxes indexed like ``rect``.

        Returns:
            False if some table box, grown by ``self.margin`` on every side,
            strictly encloses ``rect``; True otherwise.
        """
        margin = self.margin
        for table in tables['coordination']:
            enclosed = (
                table[0] - margin < rect[0]
                and table[1] - margin < rect[1]
                and table[2] + margin > rect[2]
                and table[3] + margin > rect[3]
            )
            if enclosed:
                return False
        return True

Viewing all articles
Browse latest Browse all 14069

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>