Sub-goal: understand the source code presented at tensorflow-hangul-recognition/hangul_model.py at master · IBM/tensorflow-hangul-recognition (github.com)
Preliminary: first, understanding the image generator file
#!/usr/bin/env python
import argparse
import glob
import io
import os
import random
import numpy
from PIL import Image, ImageFont, ImageDraw
from scipy.ndimage.interpolation import map_coordinates
from scipy.ndimage.filters import gaussian_filter
SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
# Default data paths.
DEFAULT_LABEL_FILE = os.path.join(SCRIPT_PATH,
'../labels/2350-common-hangul.txt')
DEFAULT_FONTS_DIR = os.path.join(SCRIPT_PATH, '../fonts')
DEFAULT_OUTPUT_DIR = os.path.join(SCRIPT_PATH, '../image-data')
# Number of random distortion images to generate per font and character.
DISTORTION_COUNT = 3
# Width and height of the resulting image.
IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64
def generate_hangul_images(label_file, fonts_dir, output_dir):
"""Generate Hangul image files.
This will take in the passed in labels file and will generate several
images using the font files provided in the font directory. The font
directory is expected to be populated with *.ttf (True Type Font) files.
The generated images will be stored in the given output directory. Image
paths will have their corresponding labels listed in a CSV file.
"""
with io.open(label_file, 'r', encoding='utf-8') as f:
labels = f.read().splitlines()
image_dir = os.path.join(output_dir, 'hangul-images')
if not os.path.exists(image_dir):
os.makedirs(os.path.join(image_dir))
# Get a list of the fonts.
fonts = glob.glob(os.path.join(fonts_dir, '*.ttf'))
labels_csv = io.open(os.path.join(output_dir, 'labels-map.csv'), 'w',
encoding='utf-8')
total_count = 0
prev_count = 0
for character in labels:
# Print image count roughly every 5000 images.
if total_count - prev_count > 5000:
prev_count = total_count
print('{} images generated...'.format(total_count))
for font in fonts:
total_count += 1
image = Image.new('L', (IMAGE_WIDTH, IMAGE_HEIGHT), color=0)
font = ImageFont.truetype(font, 48)
drawing = ImageDraw.Draw(image)
w, h = drawing.textsize(character, font=font)
drawing.text(
((IMAGE_WIDTH-w)/2, (IMAGE_HEIGHT-h)/2),
character,
fill=(255),
font=font
)
file_string = 'hangul_{}.jpeg'.format(total_count)
file_path = os.path.join(image_dir, file_string)
image.save(file_path, 'JPEG')
labels_csv.write(u'{},{}\n'.format(file_path, character))
for i in range(DISTORTION_COUNT):
total_count += 1
file_string = 'hangul_{}.jpeg'.format(total_count)
file_path = os.path.join(image_dir, file_string)
arr = numpy.array(image)
distorted_array = elastic_distort(
arr, alpha=random.randint(30, 36),
sigma=random.randint(5, 6)
)
distorted_image = Image.fromarray(distorted_array)
distorted_image.save(file_path, 'JPEG')
labels_csv.write(u'{},{}\n'.format(file_path, character))
print('Finished generating {} images.'.format(total_count))
labels_csv.close()
def elastic_distort(image, alpha, sigma):
"""Perform elastic distortion on an image.
Here, alpha refers to the scaling factor that controls the intensity of the
deformation. The sigma variable refers to the Gaussian filter standard
deviation.
"""
random_state = numpy.random.RandomState(None)
shape = image.shape
dx = gaussian_filter(
(random_state.rand(*shape) * 2 - 1),
sigma, mode="constant"
) * alpha
dy = gaussian_filter(
(random_state.rand(*shape) * 2 - 1),
sigma, mode="constant"
) * alpha
x, y = numpy.meshgrid(numpy.arange(shape[0]), numpy.arange(shape[1]))
indices = numpy.reshape(y+dy, (-1, 1)), numpy.reshape(x+dx, (-1, 1))
return map_coordinates(image, indices, order=1).reshape(shape)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--label-file', type=str, dest='label_file',
default=DEFAULT_LABEL_FILE,
help='File containing newline delimited labels.')
parser.add_argument('--font-dir', type=str, dest='fonts_dir',
default=DEFAULT_FONTS_DIR,
help='Directory of ttf fonts to use.')
parser.add_argument('--output-dir', type=str, dest='output_dir',
default=DEFAULT_OUTPUT_DIR,
help='Output directory to store generated images and '
'label CSV file.')
args = parser.parse_args()
generate_hangul_images(args.label_file, args.fonts_dir, args.output_dir)
1. What is argparse?
Argparse Tutorial — Python 3.9.3 documentation
The official Python argparse documentation is confusingly written; not much help.
How to use Python argparse (greeksharifa.github.io)
This one gives some context and is much easier to follow.
argparse is the library you need when writing CLI-style programs that are run from a terminal or command prompt; it parses the arguments passed on the command line.
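A minimal, hypothetical sketch of how argparse is typically used, mirroring the pattern at the bottom of the generator script (script name and flags here are made up):
# toy_cli.py -- run e.g. `python toy_cli.py --name hangul --repeat 2`
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--name', type=str, default='world',
                    help='Name to greet.')
parser.add_argument('--repeat', type=int, dest='repeat', default=1,
                    help='Number of times to print the greeting.')
args = parser.parse_args()
for _ in range(args.repeat):
    print('Hello, {}!'.format(args.name))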
2. What is io?
Python IO Module: The Complete Practical Reference - AskPython
This module is quite useful when you want to perform file-related I/O operations (e.g. file reading/writing)
While you can use the normal read() and write() methods to read/write to a file, this module gives us a lot more flexibility regarding these operations.
2-1. Then what is the scope of f in `with file as f`? → the with statement does not create a new scope; f stays bound in the enclosing scope, but the file is closed when the block exits.
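A quick self-contained check of that scoping question (the file name here is hypothetical):
import io
with io.open('scope-test.txt', 'w', encoding='utf-8') as f:
    f.write(u'가\n나\n')
# the with block did not introduce a new scope: `f` is still visible here,
# but the file was closed automatically on exiting the block
print(f.closed)  # True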
OK, that's enough to understand the image generator code.
Main code
#!/usr/bin/env python
import argparse
import io
import os
import tensorflow as tf
from tensorflow.python.tools import freeze_graph
from tensorflow.python.tools import optimize_for_inference_lib
SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
# Default paths.
DEFAULT_LABEL_FILE = os.path.join(SCRIPT_PATH,
'./labels/2350-common-hangul.txt')
DEFAULT_TFRECORDS_DIR = os.path.join(SCRIPT_PATH, 'tfrecords-output')
DEFAULT_OUTPUT_DIR = os.path.join(SCRIPT_PATH, 'saved-model')
MODEL_NAME = 'hangul_tensorflow'
IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64
DEFAULT_NUM_EPOCHS = 15
BATCH_SIZE = 100
# This will be determined by the number of entries in the given label file.
num_classes = 2350
def _parse_function(example):
features = tf.parse_single_example(
example,
features={
'image/class/label': tf.FixedLenFeature([], tf.int64),
'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
default_value='')
})
label = features['image/class/label']
image_encoded = features['image/encoded']
# Decode the JPEG.
image = tf.image.decode_jpeg(image_encoded, channels=1)
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
image = tf.reshape(image, [IMAGE_WIDTH*IMAGE_HEIGHT])
# Represent the label as a one hot vector.
label = tf.stack(tf.one_hot(label, num_classes))
return image, label
def export_model(model_output_dir, input_node_names, output_node_name):
"""Export the model so we can use it later.
This will create two Protocol Buffer files in the model output directory.
These files represent a serialized version of our model with all the
learned weights and biases. One of the ProtoBuf files is a version
optimized for inference-only usage.
"""
name_base = os.path.join(model_output_dir, MODEL_NAME)
frozen_graph_file = os.path.join(model_output_dir,
'frozen_' + MODEL_NAME + '.pb')
freeze_graph.freeze_graph(
name_base + '.pbtxt', None, False, name_base + '.chkp',
output_node_name, "save/restore_all", "save/Const:0",
frozen_graph_file, True, ""
)
input_graph_def = tf.GraphDef()
with tf.gfile.Open(frozen_graph_file, "rb") as f:
input_graph_def.ParseFromString(f.read())
output_graph_def = optimize_for_inference_lib.optimize_for_inference(
input_graph_def, input_node_names, [output_node_name],
tf.float32.as_datatype_enum)
optimized_graph_file = os.path.join(model_output_dir,
'optimized_' + MODEL_NAME + '.pb')
with tf.gfile.GFile(optimized_graph_file, "wb") as f:
f.write(output_graph_def.SerializeToString())
print("Inference optimized graph saved at: " + optimized_graph_file)
def weight_variable(shape):
"""Generates a weight variable of a given shape."""
initial = tf.random.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial, name='weight')
def bias_variable(shape):
"""Generates a bias variable of a given shape."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial, name='bias')
def main(label_file, tfrecords_dir, model_output_dir, num_train_epochs):
"""Perform graph definition and model training.
Here we will first create our input pipeline for reading in TFRecords
files and producing random batches of images and labels.
Next, a convolutional neural network is defined, and training is performed.
After training, the model is exported to be used in applications.
"""
global num_classes
labels = io.open(label_file, 'r', encoding='utf-8').read().splitlines()
num_classes = len(labels)
# Define names so we can later reference specific nodes for when we use
# the model for inference later.
input_node_name = 'input'
keep_prob_node_name = 'keep_prob'
output_node_name = 'output'
if not os.path.exists(model_output_dir):
os.makedirs(model_output_dir)
print('Processing data...')
tf_record_pattern = os.path.join(tfrecords_dir, '%s-*' % 'train')
train_data_files = tf.gfile.Glob(tf_record_pattern)
tf_record_pattern = os.path.join(tfrecords_dir, '%s-*' % 'test')
test_data_files = tf.gfile.Glob(tf_record_pattern)
# Create training dataset input pipeline.
train_dataset = tf.data.TFRecordDataset(train_data_files) \
.map(_parse_function) \
.shuffle(1000) \
.repeat(num_train_epochs) \
.batch(BATCH_SIZE) \
.prefetch(1)
# Create the model!
# Placeholder to feed in image data.
x = tf.placeholder(tf.float32, [None, IMAGE_WIDTH*IMAGE_HEIGHT],
name=input_node_name)
# Placeholder to feed in label data. Labels are represented as one_hot
# vectors.
y_ = tf.placeholder(tf.float32, [None, num_classes])
# Reshape the image back into two dimensions so we can perform convolution.
x_image = tf.reshape(x, [-1, IMAGE_WIDTH, IMAGE_HEIGHT, 1])
# First convolutional layer. 32 feature maps.
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
x_conv1 = tf.nn.conv2d(x_image, W_conv1, strides=[1, 1, 1, 1],
padding='SAME')
h_conv1 = tf.nn.relu(x_conv1 + b_conv1)
# Max-pooling.
h_pool1 = tf.nn.max_pool(h_conv1, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
# Second convolutional layer. 64 feature maps.
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
x_conv2 = tf.nn.conv2d(h_pool1, W_conv2, strides=[1, 1, 1, 1],
padding='SAME')
h_conv2 = tf.nn.relu(x_conv2 + b_conv2)
h_pool2 = tf.nn.max_pool(h_conv2, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
# Third convolutional layer. 128 feature maps.
W_conv3 = weight_variable([3, 3, 64, 128])
b_conv3 = bias_variable([128])
x_conv3 = tf.nn.conv2d(h_pool2, W_conv3, strides=[1, 1, 1, 1],
padding='SAME')
h_conv3 = tf.nn.relu(x_conv3 + b_conv3)
h_pool3 = tf.nn.max_pool(h_conv3, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
# Fully connected layer. Here we choose to have 1024 neurons in this layer.
h_pool_flat = tf.reshape(h_pool3, [-1, 8*8*128])
W_fc1 = weight_variable([8*8*128, 1024])
b_fc1 = bias_variable([1024])
h_fc1 = tf.nn.relu(tf.matmul(h_pool_flat, W_fc1) + b_fc1)
# Dropout layer. This helps fight overfitting.
keep_prob = tf.placeholder(tf.float32, name=keep_prob_node_name)
h_fc1_drop = tf.nn.dropout(h_fc1, rate=1-keep_prob)
# Classification layer.
W_fc2 = weight_variable([1024, num_classes])
b_fc2 = bias_variable([num_classes])
y = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
# This isn't used for training, but for when using the saved model.
tf.nn.softmax(y, name=output_node_name)
# Define our loss.
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits_v2(
labels=tf.stop_gradient(y_),
logits=y
)
)
# Define our optimizer for minimizing our loss. Here we choose a learning
# rate of 0.0001 with AdamOptimizer. This utilizes something
# called the Adam algorithm, which uses adaptive learning rates and
# momentum to get past saddle points.
train_step = tf.train.AdamOptimizer(0.0001).minimize(cross_entropy)
# Define accuracy.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
correct_prediction = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(correct_prediction)
saver = tf.train.Saver()
with tf.Session() as sess:
# Initialize the variables.
sess.run(tf.global_variables_initializer())
checkpoint_file = os.path.join(model_output_dir, MODEL_NAME + '.chkp')
# Save the graph definition to a file.
tf.train.write_graph(sess.graph_def, model_output_dir,
MODEL_NAME + '.pbtxt', True)
try:
iterator = train_dataset.make_one_shot_iterator()
batch = iterator.get_next()
step = 0
while True:
# Get a batch of images and their corresponding labels.
train_images, train_labels = sess.run(batch)
# Perform the training step, feeding in the batches.
sess.run(train_step, feed_dict={x: train_images,
y_: train_labels,
keep_prob: 0.5})
if step % 100 == 0:
train_accuracy = sess.run(
accuracy,
feed_dict={x: train_images, y_: train_labels,
keep_prob: 1.0}
)
print("Step %d, Training Accuracy %g" %
(step, float(train_accuracy)))
# Every 10,000 iterations, we save a checkpoint of the model.
if step % 10000 == 0:
saver.save(sess, checkpoint_file, global_step=step)
step += 1
except tf.errors.OutOfRangeError:
pass
# Save a checkpoint after training has completed.
saver.save(sess, checkpoint_file)
# See how model did by running the testing set through the model.
print('Testing model...')
# Create testing dataset input pipeline.
test_dataset = tf.data.TFRecordDataset(test_data_files) \
.map(_parse_function) \
.batch(BATCH_SIZE) \
.prefetch(1)
# Define a different tensor operation for summing the correct
# predictions.
accuracy2 = tf.reduce_sum(correct_prediction)
total_correct_preds = 0
total_preds = 0
try:
iterator = test_dataset.make_one_shot_iterator()
batch = iterator.get_next()
while True:
test_images, test_labels = sess.run(batch)
acc = sess.run(accuracy2, feed_dict={x: test_images,
y_: test_labels,
keep_prob: 1.0})
total_preds += len(test_images)
total_correct_preds += acc
except tf.errors.OutOfRangeError:
pass
test_accuracy = total_correct_preds/total_preds
print("Testing Accuracy {}".format(test_accuracy))
export_model(model_output_dir, [input_node_name, keep_prob_node_name],
output_node_name)
sess.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--label-file', type=str, dest='label_file',
default=DEFAULT_LABEL_FILE,
help='File containing newline delimited labels.')
parser.add_argument('--tfrecords-dir', type=str, dest='tfrecords_dir',
default=DEFAULT_TFRECORDS_DIR,
help='Directory of TFRecords files.')
parser.add_argument('--output-dir', type=str, dest='output_dir',
default=DEFAULT_OUTPUT_DIR,
help='Output directory to store saved model files.')
parser.add_argument('--num-train-epochs', type=int,
dest='num_train_epochs',
default=DEFAULT_NUM_EPOCHS,
help='Number of times to iterate over all of the '
'training data.')
args = parser.parse_args()
main(args.label_file, args.tfrecords_dir,
args.output_dir, args.num_train_epochs)
1. Meaning of __file__
This is the path of the file that contains the currently executing code.
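A tiny check, mirroring how SCRIPT_PATH is built at the top of both scripts:
import os
print(__file__)                                    # path of the file containing this code
print(os.path.dirname(os.path.abspath(__file__)))  # absolute directory of that file, i.e. SCRIPT_PATH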
2. What is a TFRecord file? → TensorFlow's binary file format: a sequence of serialized tf.train.Example records, which the training script reads back with tf.data.TFRecordDataset.
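A hedged sketch (TF 1.x API, assuming the same feature names as _parse_function above) of what writing one record looks like; the paths and label index are hypothetical:
import tensorflow as tf
def write_example(writer, jpeg_bytes, label_index):
    # a TFRecord file is just a sequence of serialized tf.train.Example protobufs
    example = tf.train.Example(features=tf.train.Features(feature={
        'image/encoded': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[jpeg_bytes])),
        'image/class/label': tf.train.Feature(
            int64_list=tf.train.Int64List(value=[label_index])),
    }))
    writer.write(example.SerializeToString())
# hypothetical usage:
# with tf.python_io.TFRecordWriter('train-00000.tfrecords') as writer:
#     with open('hangul_1.jpeg', 'rb') as f:
#         write_example(writer, f.read(), 0)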
3. Object detection modeling
The content behind this link is really solid; definitely worth reading carefully.
R-CNN object detection with Keras, TensorFlow, and Deep Learning - PyImageSearch
From here on: a survey of Scene Text Recognition resources
1. Google Tesseract API library
How to Extract Text from Images with Python | by Costas Andreou | Towards Data Science
Pillow + Leptonica
# cmd
# pip install pytesseract
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'
# path to the tesseract executable
print(pytesseract.image_to_string(r'D:\examplepdf2image.png'))
# path to the image file
Input: image file
Output: text string
2. Reading text with OpenCV
Text extraction from image using OpenCV and OCR Python (etutorialspoint.com)
# import modules
import cv2
import pytesseract
# read image
img = cv2.imread('quotes.png')
# set configurations
config = ('-l eng --oem 1 --psm 3')
# Convert the image to gray scale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# OTSU threshold performing
ret, threshimg = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
# Specifying kernel size and structure shape.
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18, 18))
# Applying dilation on the threshold image
dilation = cv2.dilate(threshimg, rect_kernel, iterations = 1)
# getting contours
img_contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_NONE)
# Loop over contours and crop and extract the text file
for cnt in img_contours:
x, y, w, h = cv2.boundingRect(cnt)
# Drawing a rectangle
rect = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
# Cropping the text block
cropped_img = img[y:y + h, x:x + w]
# Open the text file in append mode
file = open("recognized.txt", "a")
# Applying tesseract OCR on the cropped image, using the config defined above
text = pytesseract.image_to_string(cropped_img, config=config)
# Appending the text into file
file.write(text)
file.write("\n")
# Close the file
file.close()
[Part 55] Machine Learning Basics 2 - Recognizing handwriting with kNN : Naver Blog (naver.com)
Detecting text in photos with deep learning and OpenCV (naver.com)
(2017)
3. Heuristic-based approach & EAST
Detecting machine-readable zones in passport images - PyImageSearch
↑ (2015) Detects the text region in passport images; this is a heuristic-based approach (deductive, as opposed to inductive).
Techniques used: thresholding, morphological operations, and contour properties
# detect_mrz.py: find the MRZ region in a passport photo
# import the necessary packages
from imutils import paths
import numpy as np
import argparse
import imutils
import cv2
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--images", required=True, help="path to images directory")
args = vars(ap.parse_args())
# initialize a rectangular and square structuring kernel
rectKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (13, 5)) # horizontally elongated rectangle
sqKernel = cv2.getStructuringElement(cv2.MORPH_RECT, (21, 21))
# loop over the input image paths
for imagePath in paths.list_images(args["images"]):
# load the image, resize it, and convert it to grayscale
image = cv2.imread(imagePath)
image = imutils.resize(image, height=600)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# smooth the image using a 3x3 Gaussian, then apply the blackhat
# morphological operator to find dark regions on a light background
gray = cv2.GaussianBlur(gray, (3, 3), 0)
blackhat = cv2.morphologyEx(gray, cv2.MORPH_BLACKHAT, rectKernel)
# blackhat is the right choice here since the background is mostly light
# compute the Scharr gradient of the blackhat image and scale the
# result into the range [0, 255]
gradX = cv2.Sobel(blackhat, ddepth=cv2.CV_32F, dx=1, dy=0, ksize=-1)
gradX = np.absolute(gradX)
(minVal, maxVal) = (np.min(gradX), np.max(gradX))
gradX = (255 * ((gradX - minVal) / (maxVal - minVal))).astype("uint8")
# apply a closing operation using the rectangular kernel to close
# gaps in between letters -- then apply Otsu's thresholding method
gradX = cv2.morphologyEx(gradX, cv2.MORPH_CLOSE, rectKernel)
thresh = cv2.threshold(gradX, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
###### stop here if you only want to mark each text-line region #####
# perform another closing operation, this time using the square
# kernel to close gaps between lines of the MRZ, then perform a
# series of erosions to break apart connected components
###### this part merges the text lines into one blob ######
thresh = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, sqKernel)
# during thresholding, it's possible that border pixels were
# included in the thresholding, so let's set 5% of the left and
# right borders to zero
##### clean up by zeroing 5% of the left and right borders #####
p = int(image.shape[1] * 0.05)
thresh[:, 0:p] = 0
thresh[:, image.shape[1] - p:] = 0
thresh = cv2.erode(thresh, None, iterations=4)
# find contours in the thresholded image and sort them by their size
cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sorted(cnts, key=cv2.contourArea, reverse=True)
# loop over the contours
for c in cnts:
# compute the bounding box of the contour and use the contour to
# compute the aspect ratio and coverage ratio of the bounding box
# width to the width of the image
(x, y, w, h) = cv2.boundingRect(c)
ar = w / float(h)
crWidth = w / float(gray.shape[1])
# check to see if the aspect ratio and coverage width are within
# acceptable criteria
if ar > 5 and crWidth > 0.75:
# pad the bounding box since we applied erosions and now need
# to re-grow it
pX = int((x + w) * 0.03)
pY = int((y + h) * 0.03)
(x, y) = (x - pX, y - pY)
(w, h) = (w + (pX * 2), h + (pY * 2))
# extract the ROI from the image and draw a bounding box
# surrounding the MRZ
roi = image[y:y + h, x:x + w].copy()
cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
break
# show the output images
cv2.imshow("Image", image)
cv2.imshow("ROI", roi)
cv2.waitKey(0)
Histogram of Oriented Gradients and Object Detection - PyImageSearch
Step 1: Build a sample set P of the object you want to detect and extract HOG descriptors from those samples.
Step 2: Build a sample set N that does not contain the object and extract HOG descriptors from it as well. In practice N >> P.
Step 3: Train a linear Support Vector Machine on P and N.
Step 4: Apply hard-negative mining (sliding window technique).
Step 5: Sort the samples detected as false positives in the previous step by their confidence and retrain the model on them.
Step 6: Apply the model to the test data. (A rough sketch of steps 1-3 follows below.)
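A rough sketch of steps 1-3 under stated assumptions: scikit-image and scikit-learn stand in for whatever the tutorial uses, and the sample arrays are random placeholders:
import numpy as np
from skimage.feature import hog
from sklearn.svm import LinearSVC
# hypothetical stand-ins for the positive (P) and negative (N) sample sets, N >> P
positive_images = [np.random.rand(128, 64) for _ in range(20)]
negative_images = [np.random.rand(128, 64) for _ in range(200)]
def hog_features(images):
    # steps 1-2: one HOG descriptor per sample
    return [hog(img, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2))
            for img in images]
X = np.vstack(hog_features(positive_images) + hog_features(negative_images))
y = np.array([1] * len(positive_images) + [0] * len(negative_images))
# step 3: linear SVM over the HOG descriptors
svm = LinearSVC(C=0.01, max_iter=10000)
svm.fit(X, y)
# step 4 (hard-negative mining) would slide a window over object-free images,
# score each window with svm.decision_function, and keep high-scoring windows
# as extra negatives before retraining (step 5)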
OpenCV Text Detection (EAST text detector) - PyImageSearch
Important: the EAST text detector requires that your input image dimensions be multiples of 32, so if you choose to adjust your --width and --height values, make sure they are multiples of 32!
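A tiny helper, under the assumption that simply rounding down to the nearest multiple of 32 is acceptable for this use case:
def round_down_to_multiple_of_32(value):
    # keep --width/--height valid for EAST (and never smaller than 32)
    return max(32, (int(value) // 32) * 32)
print(round_down_to_multiple_of_32(600))  # 576
print(round_down_to_multiple_of_32(320))  # 320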
# USAGE
# python text_detection.py --image images/lebron_james.jpg --east frozen_east_text_detection.pb
# import the necessary packages
from imutils.object_detection import non_max_suppression
import numpy as np
import argparse
import time
import cv2
# construct the argument parser and parse the arguments
# (these would be read from the console; commented out here in favor of hard-coded values)
# ap = argparse.ArgumentParser()
# ap.add_argument("-i", "--image", type=str, help="path to input image")
# ap.add_argument("-east", "--east", type=str,help="path to input EAST text detector")
# ap.add_argument("-c", "--min-confidence", type=float, default=0.5, help="minimum probability required to inspect a region")
# ap.add_argument("-w", "--width", type=int, default=320, help="resized image width (should be multiple of 32)")
# ap.add_argument("-e", "--height", type=int, default=320, help="resized image height (should be multiple of 32)")
# args = vars(ap.parse_args())
# load the input image and grab the image dimensions
# image = cv2.imread(args["image"])
image = cv2.imread("D:/2020/hycheck/sample3.jpg")
orig = image.copy()
(H, W) = image.shape[:2]
# set the new width and height and then determine the ratio in change
# for both the width and height
# target size
# (newW, newH) = (args["width"], args["height"])
(newW, newH) = (320, 320)
rW = W / float(newW)
rH = H / float(newH)
# resize the image and grab the new image dimensions
image = cv2.resize(image, (newW, newH))
(H, W) = image.shape[:2]
# define the two output layer names for the EAST detector model that
# we are interested -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = [
"feature_fusion/Conv_7/Sigmoid",
"feature_fusion/concat_3"]
# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
# load the model
# net = cv2.dnn.readNet(args["east"])
net = cv2.dnn.readNet("D:/2020/opencv-text-detection/frozen_east_text_detection.pb")
# construct a blob from the image and then perform a forward pass of
# the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage(image, 1.0, (W, H), (123.68, 116.78, 103.94), swapRB=True, crop=False)
start = time.time()
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)
end = time.time()
# show timing information on text prediction
print("[INFO] text detection took {:.6f} seconds".format(end - start))
# grab the number of rows and columns from the scores volume, then
# initialize our set of bounding box rectangles and corresponding
# confidence scores
(numRows, numCols) = scores.shape[2:4]
rects = []
confidences = []
# loop over the number of rows
for y in range(0, numRows):
# extract the scores (probabilities), followed by the geometrical
# data used to derive potential bounding box coordinates that
# surround text
scoresData = scores[0, 0, y]
xData0 = geometry[0, 0, y]
xData1 = geometry[0, 1, y]
xData2 = geometry[0, 2, y]
xData3 = geometry[0, 3, y]
anglesData = geometry[0, 4, y]
# loop over the number of columns
for x in range(0, numCols):
# if our score does not have sufficient probability, ignore it
# if scoresData[x] < args["min_confidence"]:
if scoresData[x] < 0.5:
continue
# compute the offset factor as our resulting feature maps will
# be 4x smaller than the input image
(offsetX, offsetY) = (x * 4.0, y * 4.0)
# extract the rotation angle for the prediction and then
# compute the sin and cosine
angle = anglesData[x]
cos = np.cos(angle)
sin = np.sin(angle)
# use the geometry volume to derive the width and height of
# the bounding box
h = xData0[x] + xData2[x]
w = xData1[x] + xData3[x]
# compute both the starting and ending (x, y)-coordinates for
# the text prediction bounding box
endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
startX = int(endX - w)
startY = int(endY - h)
# add the bounding box coordinates and probability score to
# our respective lists
rects.append((startX, startY, endX, endY))
confidences.append(scoresData[x])
# apply non-maxima suppression to suppress weak, overlapping bounding
# boxes
boxes = non_max_suppression(np.array(rects), probs=confidences)
# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
# scale the bounding box coordinates based on the respective
# ratios
startX = int(startX * rW)
startY = int(startY * rH)
endX = int(endX * rW)
endY = int(endY * rH)
# draw the bounding box on the image
cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)
# show the output image
cv2.imshow("Text Detection", orig)
cv2.waitKey(0)
argman/EAST: A tensorflow implementation of EAST text detector (github.com)
[1704.03155] EAST: An Efficient and Accurate Scene Text Detector (arxiv.org)
EAST text detector architecture
4. Business card text detection
5. YOLO
YOLO v4 review: Optimal Speed and Accuracy of Object Detection (study in progress) (ropiens.tistory.com)
YOLO object detection with OpenCV - PyImageSearch
(2018)
tensorflow - Can we use Yolo to detect and recognize text in a image - Stack Overflow
Object Detection on Newspaper images using YoloV3 | by Vaibhav Birla | Towards Data Science
#011 TF YOLO V3 Object Detection in TensorFlow 2.0 (datahacker.rs)
(theory explanation + TensorFlow implementation)
(Tech report that is HILARIOUS!)
6. Mask R-CNN
Mask R-CNN with OpenCV - PyImageSearch
(2018) Automatically segments objects with pixel-level masks.
Mask R-CNN (FAIR, 2018); RoIAlign (preserves exact spatial locations); decouples mask and class prediction; instance-level recognition; mask prediction and class labeling run in parallel
Splash of Color: Instance Segmentation with Mask R-CNN and TensorFlow
R-CNN object detection with Keras, TensorFlow, and Deep Learning - PyImageSearch
7. Papers and patents + GitHub
↑ Summary: reading Hangul on signboards where the letter shapes are distorted;
as an example, to read '닭', a dataset of 5,383 Hangul images was split train:test = 8:2 and used to train Mask R-CNN and skeleton-extraction models. Result: 92.65% accuracy.
Hangul sentence OCR project using deep learning, HCLT 2019
A six-month personal deep learning project (April to October 2019); the write-up covers what was learned from topic selection through paper writing (medium.com)
8. Post-processing
jeongukjae.github.io/posts/korean-spacing-model/
Korean word-spacing model
hong-yp-ml-records.tistory.com/99
[ADV project] Text preprocessing Part 2: spelling correction (Pusan National University spell checker, py-hanspell)
Spell-check model
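A hedged sketch of running OCR output through py-hanspell, assuming (per its README) that spell_checker.check(text).checked returns the corrected sentence; the input string is made up:
from hanspell import spell_checker
ocr_output = u'한글 인식결과를 맞춤법 교정기로 다듬어 봅니다'  # hypothetical OCR result
result = spell_checker.check(ocr_output)
print(result.checked)  # spacing/spelling-corrected text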
9. Font collection