from PIL import Image
import pytesseract
import cv2
import numpy as np
from matplotlib import pyplot as plt
from pathlib import Path
# Tell pytesseract which Tesseract executable to run (hard-coded Windows install path).
pytesseract.pytesseract.tesseract_cmd = "C:/Program Files (x86)/Tesseract-OCR/tesseract.exe"
# Files: locate and open the two input images.
# Sample images live in a ``data`` directory under the current working directory.
data = Path.cwd() / 'data'

# Open both images in one pass, then expose them under their individual names.
img = [Image.open(data / name) for name in ('0244R.jpg', '0244R_clean.jpg')]
img_original, img_new = img

# Bare name expressions: they have no effect when this runs as a plain script
# (presumably left over from notebook cells that displayed the images).
img_original
img_new

# OCR each image exactly as loaded and print the recognized text.
for x in img:
    ocr_text = pytesseract.image_to_string(x)
    print(f'OCR Result: {ocr_text}\n')
def image_smoothening(img):
    """Binarize ``img`` with a fixed threshold, Otsu, a Gaussian blur, and Otsu again.

    Args:
        img: Image array passed straight to ``cv2.threshold``
            (presumably single-channel 8-bit, since the Otsu flag is
            used -- TODO confirm against callers).

    Returns:
        The image produced by the final Otsu threshold.
    """
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    # cv2.threshold returns (threshold_value, image); only the image is needed.
    _, fixed_cut = cv2.threshold(img, 88, 255, cv2.THRESH_BINARY)
    _, first_otsu = cv2.threshold(fixed_cut, 0, 255, otsu_binary)

    # 5x5 Gaussian kernel; a sigma of 0 leaves the sigma choice to OpenCV.
    blurred = cv2.GaussianBlur(first_otsu, (5, 5), 0)

    # Re-binarize, since blurring reintroduces intermediate gray levels.
    _, smoothed = cv2.threshold(blurred, 0, 255, otsu_binary)
    return smoothed
def remove_noise_and_smooth(file_name):
    """Load an image as grayscale and combine two binarizations of it.

    The image is binarized once with a mean adaptive threshold (followed
    by a morphological open and close) and once with
    ``image_smoothening``; the two results are merged with a bitwise OR.

    Args:
        file_name: Path of the image file. It is converted with ``str()``
            before being handed to ``cv2.imread``.

    Returns:
        The bitwise OR of the smoothed image and the adaptively
        thresholded image.

    Raises:
        OSError: If ``cv2.imread`` cannot read ``file_name``.
    """
    img = cv2.imread(str(file_name), cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; fail here with the offending path rather than
        # letting None flow into the OpenCV calls below.
        raise OSError(f'Could not read image: {file_name}')

    # Mean adaptive threshold with block size 9 and constant C = 41.
    filtered = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 41)

    # NOTE(review): with a 1x1 kernel the open/close below presumably leave
    # the image unchanged -- confirm whether a larger kernel was intended.
    kernel = np.ones((1, 1), np.uint8)
    opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)
    closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)

    img = image_smoothening(img)
    or_image = cv2.bitwise_or(img, closing)
    return or_image
# These threshold options are not used because they do not produce successful OCR results.
# Threshold modes that the comparison loop below does not iterate over.
not_used_thresh = [cv2.THRESH_BINARY_INV, cv2.THRESH_TOZERO_INV]
# Threshold modes that are compared.
cv2_thresh_list = [cv2.THRESH_BINARY, cv2.THRESH_TRUNC, cv2.THRESH_TOZERO]

# The same source file twice: once through the cleanup pipeline, once
# loaded as plain grayscale.
img1 = remove_noise_and_smooth(data / '0244R.jpg')
img2 = cv2.imread(str(data / '0244R.jpg'), cv2.IMREAD_GRAYSCALE)

# Section headings printed before each pass, keyed by position in the list below.
img_type = {
    0: 'Preprocessed Images\n',
    1: '\nUnprocessed Images\n',
}

for i, img in enumerate([img1, img2]):
    print(img_type[i])
    for item in cv2_thresh_list:
        print('Thresh: {}'.format(item))
        _, thresh = cv2.threshold(img, 127, 255, item)

        # Plot the thresholded image, save the figure, then display it.
        # NOTE(review): f_name depends only on the threshold flag, so the
        # second pass writes to the same file names as the first.
        plt.imshow(thresh, cmap='gray')
        f_name = f'{item}.jpg'
        plt.savefig(f_name)
        plt.show()

        # NOTE(review): OCR is run on the saved figure file, not on
        # `thresh` itself -- confirm that this is intended.
        ocr_text = pytesseract.image_to_string(f_name)
        print('OCR Result: {}\n'.format(ocr_text))