I need to crop each column out of a scanned PDF. I tried many of the solutions posted here, but none of them worked.
For example, I have the image below.
https://i.stack.imgur.com/YdPSG.jpg
I am using the script below to remove the borders, because extracting each column is difficult for me while the borders are present.
import cv2

# Preview windows are shrunk by this factor so that large scans fit on screen.
SCALE = 4


def show_scaled(name, img):
    """Show ``img`` in a window titled ``name``, down-scaled by ``SCALE``.

    Only the first two entries of ``img.shape`` (height, width) are used, so
    both single-channel and multi-channel images are accepted.
    """
    h, w = img.shape[:2]
    cv2.imshow(name, cv2.resize(img, (w // SCALE, h // SCALE)))


def _erase_lines(image, thresh, kernel_size, window_name):
    """Paint over the lines found in ``thresh`` with white, in place on ``image``.

    ``kernel_size`` is the (width, height) of the rectangular structuring
    element used for the morphological opening; a long, thin kernel keeps
    only strokes running in that direction. The intermediate mask is shown
    in a preview window titled ``window_name``.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernel_size)
    detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)
    show_scaled(window_name, detected_lines)
    cnts = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns (contours, hierarchy) or (image, contours, hierarchy)
    # depending on the OpenCV version; pick the contour list in either case.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    for c in cnts:
        cv2.drawContours(image, [c], -1, (255, 255, 255), 2)


image = cv2.imread('3-1.png')
if image is None:
    # imread signals failure by returning None instead of raising.
    raise FileNotFoundError("could not read input image '3-1.png'")

gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Inverted Otsu threshold: dark ink becomes white (foreground) for morphology.
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Remove horizontal
_erase_lines(image, thresh, (15, 1), "horizontal detected lines")

# Remove vertical
_erase_lines(image, thresh, (1, 15), "vertical detected lines")

cv2.imshow('image', image)
cv2.imwrite("contours.png", image)
cv2.waitKey()
Result is: https://i.stack.imgur.com/HDqt2.png
After removing the borders, I use the script below to extract each column.
import cv2

# Preview windows are shrunk by this factor so that large scans fit on screen.
SCALE = 4
# Minimum convex-hull area (in pixels) for a contour to be treated as a column.
# NOTE(review): the value looks tuned for one specific scan size - confirm
# before reusing it on images with a different resolution.
AREA_THRESHOLD = 427505.0 / 2


def show_scaled(name, img):
    """Show ``img`` in a window titled ``name``, down-scaled by ``SCALE``.

    Only the first two entries of ``img.shape`` (height, width) are used, so
    both single-channel and multi-channel images are accepted.
    """
    h, w = img.shape[:2]
    cv2.imshow(name, cv2.resize(img, (w // SCALE, h // SCALE)))


def main():
    """Find large text blocks in ``contours.png`` and save each one as a crop.

    Every external contour whose convex hull is larger than
    ``AREA_THRESHOLD`` is outlined on a preview copy and its bounding
    rectangle is cropped from the untouched input and written to disk.

    Raises:
        FileNotFoundError: if ``contours.png`` cannot be read.
    """
    base_img = cv2.imread("contours.png")
    if base_img is None:
        # imread signals failure by returning None instead of raising.
        raise FileNotFoundError("could not read input image 'contours.png'")

    # Draw on a copy so that the crops come from the unmarked image.
    img = base_img.copy()
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    show_scaled("original", gray)

    # black and white, and inverted, because
    # white pixels are treated as objects in
    # contour detection
    thresholded = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV,
        25,
        15
    )
    show_scaled('thresholded', thresholded)

    # I use a kernel that is wide enough to connect characters
    # but not text blocks, and tall enough to connect lines.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 110))
    closing = cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, kernel)

    found = cv2.findContours(closing, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns (contours, hierarchy) or (image, contours, hierarchy)
    # depending on the OpenCV version; pick the contour list in either case.
    contours = found[0] if len(found) == 2 else found[1]
    show_scaled("closing", closing)

    for i, contour in enumerate(contours):
        convex_contour = cv2.convexHull(contour)
        area = cv2.contourArea(convex_contour)
        if area > AREA_THRESHOLD:
            cv2.drawContours(img, [convex_contour], -1, (255, 0, 0), 3)
            x, y, w, h = cv2.boundingRect(contour)
            cropped_image = base_img[y:y + h, x:x + w]
            # imwrite returns False (no exception) when the file cannot be
            # written, so the printed value is the only success indicator.
            res = cv2.imwrite(
                "/Users/safa.yildirim@propertyfinder.ae/Desktop/personal/neutech-image-processing"
                "/cropped-images/column" + str(i) + ".png",
                cropped_image,
            )
            print(res)

    show_scaled("contours", img)
    cv2.imwrite("/tmp/contours.png", img)
    cv2.waitKey()


if __name__ == '__main__':
    main()
Result is: https://i.stack.imgur.com/R0xWt.jpg
Now there are two problems with what I did. First, the "KULLANMA" text in the upper-right corner should also be removed. I can't simply crop the image at fixed coordinates, because the image can be smaller or larger, so the pixel distance between that text and the text below it can change.
https://i.stack.imgur.com/aFkxb.png
Second, in the second script OpenCV also draws a contour around the header section, as in the image below.
https://i.stack.imgur.com/W1GmA.png
How can I get images of only the columns, and remove the upper-right text from the image?