I am extracting text from a PDF using Python and libraries such as fitz, pdfreader and so on. However, my PDF contains some schematics, and the words on them are not needed.
Here is an example.
When extracting the text, the words from the schematics are also included, but I do not want those words to appear: even if the image itself can be extracted, the text inside the image is not meaningful.
I could not come up with a strategy to delete these useless words from the pdf.
import fitz
from io import BytesIO


class DeleteGarbage:
    """Redact text that sits on or near vector drawings which are not tables.

    Every straight line and every sufficiently small rectangle returned by
    ``page.get_drawings()`` is covered by a white redaction annotation
    (grown by ``REDACT_PADDING`` on each side) unless it lies inside one of
    the table boxes supplied for the page.  Applying the redactions removes
    the text under those areas, so a later text extraction no longer sees it.
    """

    # Points added on every side of a drawing before it is redacted.
    REDACT_PADDING = 10
    # Slack (in points) allowed when testing whether a drawing is in a table.
    TABLE_TOLERANCE = 10

    _WHITE = (1, 1, 1)

    def __init__(self, max_table_area=1.5):
        """Store the area divisor used to skip very large rectangles.

        Args:
            max_table_area: a rectangle is redacted only if its area is
                smaller than ``page_area / max_table_area``.
        """
        self.max_table_area = max_table_area

    def process(self, context):
        """Redact non-table drawings on the selected page(s) and return *context*.

        Args:
            context: mapping read as follows by this method:
                ``context["fitz"]`` is iterated to obtain the pages, and
                ``context["content"]["pages"][i]["tables"]`` is passed to
                :meth:`check_if_not_table` as the table description.

        Returns:
            The same ``context`` object; pages are modified in place.
        """
        for page_number, page in enumerate(context["fitz"]):
            # NOTE(review): only the page at index 2 is processed.  This looks
            # like a debugging leftover -- confirm before removing the filter.
            if page_number != 2:
                continue

            page_area = page.rect.width * page.rect.height

            for path in page.get_drawings():
                for item in path["items"]:
                    kind = item[0]
                    if kind == "l":  # straight line: ("l", p1, p2)
                        start, end = item[1], item[2]
                        # A line may be drawn in either direction; order the
                        # coordinates so the box is never inverted.  An
                        # inverted box would fail the containment test below
                        # and produce an invalid redaction rectangle.
                        x0, x1 = sorted((start[0], end[0]))
                        y0, y1 = sorted((start[1], end[1]))
                        box = [x0, y0, x1, y1]
                    elif kind == "re":  # rectangle: ("re", rect, ...)
                        box = item[1]
                        # Very large rectangles (page frames, big tables) are
                        # left alone.
                        if not box.get_area() < page_area / self.max_table_area:
                            continue
                    else:
                        continue

                    if self.check_if_not_table(
                        box, page_number, self._tables_for_page(context, page_number)
                    ):
                        self._add_redaction(page, box)

            # Redaction annotations only remove content once applied.
            page.apply_redactions()
        return context

    @staticmethod
    def _tables_for_page(context, page_number):
        """Return the table description used for the page at *page_number*."""
        # NOTE(review): ``page_number`` is 0-based here, so ``page_number - 1``
        # selects the previous entry (and the last entry for page 0).  Kept
        # exactly as in the original -- confirm the indexing convention of
        # ``context['content']['pages']``.
        return context['content']['pages'][page_number - 1]['tables']

    def _add_redaction(self, page, box):
        """Add a white, text-less redaction annotation around *box* on *page*."""
        pad = self.REDACT_PADDING
        page.add_redact_annot(
            [box[0] - pad, box[1] - pad, box[2] + pad, box[3] + pad],
            "",
            align=fitz.TEXT_ALIGN_CENTER,
            fill=self._WHITE,
            text_color=self._WHITE,
        )

    def check_if_not_table(self, rect, page_number, tables):
        """Return ``True`` when *rect* is not contained in any table box.

        Args:
            rect: indexable ``(x0, y0, x1, y1)`` of the drawing.
            page_number: unused; kept so existing callers keep working.
            tables: mapping whose ``'coordination'`` entry is an iterable of
                indexable ``(x0, y0, x1, y1)`` table boxes.

        Returns:
            ``False`` if some table box, grown by ``TABLE_TOLERANCE`` on every
            side, strictly contains *rect*; ``True`` otherwise (including when
            there are no table boxes).
        """
        tol = self.TABLE_TOLERANCE
        return not any(
            table[0] - tol < rect[0]
            and table[1] - tol < rect[1]
            and table[2] + tol > rect[2]
            and table[3] + tol > rect[3]
            for table in tables['coordination']
        )