weinman
diff --git a/‎AUTHOR
+2 b/‎AUTHOR
+2
diff --git a/‎Makefile
+30 b/‎Makefile
+30
diff --git a/‎README.md
+87 b/‎README.md
+87
diff --git a/‎data/test/words-000.tfrecord
1.59 MB b/‎data/test/words-000.tfrecord
1.59 MB
diff --git a/‎data/train/words-000.tfrecord
12.9 MB b/‎data/train/words-000.tfrecord
12.9 MB
diff --git a/‎data/val/words-000.tfrecord
1.46 MB b/‎data/val/words-000.tfrecord
1.46 MB
diff --git a/‎src/mjsynth-tfrecord.py
+182 b/‎src/mjsynth-tfrecord.py
+182
@@ -0,0 +1,2 @@
+Jerod Weinman
+[email protected]
@@ -0,0 +1,30 @@
+# None of these targets produce a file of the same name, so declare them
+# phony to keep a stray file or directory from masking them.
+.PHONY: all demo mjsynth-download mjsynth-wget mjsynth-unpack \
+        mjsynth-tfrecord train monitor test
+
+# Full pipeline: fetch the data, pack it into TFRecords, then train
+all: mjsynth-download mjsynth-tfrecord train
+
+# Train on the small packaged example data
+demo: train
+
+mjsynth-download: mjsynth-wget mjsynth-unpack
+
+# Fetch the mjsynth archive into data/
+# (&& keeps wget from running in the wrong directory if cd fails)
+mjsynth-wget:
+	mkdir -p data
+	cd data && \
+	wget http://www.robots.ox.ac.uk/~vgg/data/text/mjsynth.tar.gz
+
+# Unpack the archive into data/images
+mjsynth-unpack:
+	mkdir -p data/images
+# strip leading mnt/ramdisk/max/90kDICT32px/
+	tar xzvf data/mjsynth.tar.gz \
+    --strip-components=4 \
+    -C data/images
+
+# Pack the unpacked images into sharded TFRecord files
+mjsynth-tfrecord:
+	mkdir -p data/train data/val data/test
+	cd src && python mjsynth-tfrecord.py
+
+train:
+	cd src && python train.py # use --help for options
+
+monitor:
+	tensorboard --logdir=data/model --port=8008
+
+test:
+	cd src && python test.py # use --help for options
@@ -0,0 +1,87 @@
+# Overview
+
+This collection demonstrates how to construct and train a deep,
+bidirectional stacked LSTM using CNN features as input with CTC loss
+to perform robust word recognition. The model is a straightforward
+adaptation of Shi et al.'s CRNN architecture (arXiv:1507.05717). The
+provided code downloads Jaderberg et al.'s synthetic data
+(doi: 10.1007/s11263-015-0823-z) and trains on it.
+
+
+
+# Structure
+
+The model as built is a hybrid of Shi et al.'s CRNN architecture
+(arXiv:1507.05717) and the VGG deep convnet, which reduces the number
+of parameters by stacking pairs of small 3x3 kernels. In addition, the
+pooling is limited in the horizontal direction to preserve
+resolution for character recognition. There must be at least one
+horizontal element per character.
+
+Assuming one starts with a 32x32 image, the dimensions at each level
+of filtering are as follows:
+
+
+===================================================================
+Layer   Op      KrnSz  Stride(v,h) OutDim   H       W       Options
+-------------------------------------------------------------------
+1       Conv    3      1            64      30      30      valid
+2       Conv    3      1            64      30      30      same
+        Pool    2      2            64      15      15
+3       Conv    3      1           128      15      15      same
+4       Conv    3      1           128      15      15      same
+        Pool    2      2,1         128       7      14      
+5       Conv    3      1           256       7      14      same
+6       Conv    3      1           256       7      14      same
+        Pool    2      2,1         256       3      13      
+7       Conv    3      1           512       3      13      same
+8       Conv    3      1           512       3      13      same
+        Pool    3      3,1         512       1      13       
+9       LSTM                       512
+10      LSTM                       512
+
+To accelerate training, a batch normalization layer is included before
+each pooling layer and ReLU non-linearities are used throughout. Other
+model details should be easily identifiable in the code.
+
+The default training mechanism uses the ADAM optimizer with learning
+rate decay.
+
+# Training
+
+To completely train the model, you will need to download the mjsynth
+dataset and pack it into sharded TensorFlow records. Then you can start
+the training process, a Tensorboard monitor, and an ongoing evaluation
+thread. The individual commands are packaged in the accompanying `Makefile`.
+
+    make mjsynth-download
+    make mjsynth-tfrecord
+    make train &
+    make monitor &
+    make test
+
+To monitor training, point your web browser to the URL (e.g.,
+http://127.0.1.1:8008) given by the Tensorboard output.
+
+Note that it may take 4-12 hours to download the complete mjsynth data
+set. A very small set (0.1%) of packaged example data is included; to
+run the small demo, skip the first two lines involving `mjsynth`.
+
+With a Geforce GTX 1080, the demo takes about 20 minutes for the
+validation character error to reach 45% (using the default
+parameters); at one hour (roughly 7000 iterations), the validation
+error is just over 20%.
+
+With the full training data, the model typically converges to around
+7% training character error and 35% word error, both varying by 2-5%.
+
+# Testing
+
+The test script streams statistics for small batches of validation (or test) data. It outputs the label error (percentage of characters predicted incorrectly), the test loss, and the sequence error (percentage of words, i.e., entire sequences, predicted incorrectly).
+
+# Configuration
+
+There are many command-line options to configure training
+parameters. Run `train.py` or `test.py` with the `--help` flag to see
+them or inspect the scripts. Model parameters are not command-line
+configurable and need to be edited in the code (see `model.py`).
@@ -0,0 +1,182 @@
+# CNN-LSTM-CTC-OCR
+# Copyright (C) 2017 Jerod Weinman
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import tensorflow as tf
+import math
+
+"""Each record within the TFRecord file is a serialized Example proto. 
+The Example proto contains the following fields:
+  image/encoded: string containing JPEG encoded grayscale image
+  image/height: integer, image height in pixels
+  image/width: integer, image width in pixels
+  image/filename: string containing the relative path of the image file
+  image/labels: list containing the sequence labels for the image text
+  text/string: string specifying the human-readable version of the text
+  text/length: integer, number of characters in the text
+"""
+
+# The list (well, string) of valid output characters
+# If any example contains a character not found here, an error will result
+# from the calls to .index in the decoder below
+out_charset="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+
+jpeg_data = tf.placeholder(dtype=tf.string)
+jpeg_decoder = tf.image.decode_jpeg(jpeg_data,channels=1)
+
+kernel_sizes = [5,5,3,3,3,3] # CNN kernels for image reduction
+
+# Minimum allowable width of image after CNN processing
+min_width = 20
+
+def calc_seq_len(image_width):
+    """Calculate sequence length of given image after CNN processing"""
+    
+    conv1_trim =  2 * (kernel_sizes[0] // 2)
+    fc6_trim = 2*(kernel_sizes[5] // 2)
+    
+    after_conv1 = image_width - conv1_trim 
+    after_pool1 = after_conv1 // 2
+    after_pool2 = after_pool1 // 2
+    after_pool4 = after_pool2 - 1 # max without stride
+    after_fc6 =  after_pool4 - fc6_trim
+    seq_len = 2*after_fc6
+    return seq_len
+
+seq_lens = [calc_seq_len(w) for w in range(1024)]
+
+def gen_data(input_base_dir, image_list_filename, output_filebase, 
+             num_shards=1000,start_shard=0):
+    """ Generate several shards worth of TFRecord data """
+    session_config = tf.ConfigProto()
+    session_config.gpu_options.allow_growth=True
+    sess = tf.Session(config=session_config)
+    image_filenames = get_image_filenames(os.path.join(input_base_dir,
+                                                       image_list_filename))
+    num_digits = math.ceil( math.log10( num_shards - 1 ))
+    shard_format = '%0'+ ('%d'%num_digits) + 'd' # Use appropriate # leading zeros
+    images_per_shard = int(math.ceil( len(image_filenames) / float(num_shards) ))
+    
+    for i in range(start_shard,num_shards):
+        start = i*images_per_shard
+        end   = (i+1)*images_per_shard
+        out_filename = output_filebase+'-'+(shard_format % i)+'.tfrecord'
+        if os.path.isfile(out_filename): # Don't recreate data if restarting
+            continue
+        print str(i),'of',str(num_shards),'[',str(start),':',str(end),']',out_filename
+        gen_shard(sess, input_base_dir, image_filenames[start:end], out_filename)
+    # Clean up writing last shard
+    start = num_shards*images_per_shard
+    out_filename = output_filebase+'-'+(shard_format % num_shards)+'.tfrecord'
+    print str(i),'of',str(num_shards),'[',str(start),':]',out_filename
+    gen_shard(sess, input_base_dir, image_filenames[start:], out_filename)
+
+    sess.close()
+
+def gen_shard(sess, input_base_dir, image_filenames, output_filename):
+    """Create a TFRecord file from a list of image filenames"""
+    writer = tf.python_io.TFRecordWriter(output_filename)
+    
+    for filename in image_filenames:
+        path_filename = os.path.join(input_base_dir,filename)
+        if os.stat(path_filename).st_size == 0:
+            print('SKIPPING',filename)
+            continue
+        try:
+            image_data,height,width = get_image(sess,path_filename)
+            text,labels = get_text_and_labels(filename)
+            if is_writable(width,text):
+                example = make_example(filename, image_data, labels, text, 
+                                       height, width)
+                writer.write(example.SerializeToString())
+            else:
+                print('SKIPPING',filename)
+        except:
+            # Some files have bogus payloads, catch and note the error, moving on
+            print('ERROR',filename)
+    writer.close()
+
+
+def get_image_filenames(image_list_filename):
+    """ Given input file, generate a list of relative filenames"""
+    filenames = []
+    with open(image_list_filename) as f:
+        for line in f:
+            # Carve out the ground truth string and file path from lines like:
+            # ./2697/6/466_MONIKER_49537.jpg 49537
+            filename = line.split(' ',1)[0][2:] # split off "./" and number
+            filenames.append(filename)
+    return filenames
+
+def get_image(sess,filename):
+    """Given path to an image file, load its data and size"""
+    with tf.gfile.FastGFile(filename, 'r') as f:
+        image_data = f.read()
+    image = sess.run(jpeg_decoder,feed_dict={jpeg_data: image_data})
+    height = image.shape[0]
+    width = image.shape[1]
+    return image_data, height, width
+
+def is_writable(image_width,text):
+    """Determine whether the CNN-processed image is longer than the string"""
+    return (image_width > min_width) and (len(text) <= seq_lens[image_width])
+    
+def get_text_and_labels(filename):
+    """ Extract the human-readable text and label sequence from image filename"""
+    # Ground truth string lines embedded within base filename between underscores
+    # 2697/6/466_MONIKER_49537.jpg --> MONIKER
+    text = os.path.basename(filename).split('_',2)[1]
+    # Transform string text to sequence of indices using charset, e.g.,
+    # MONIKER -> [12, 14, 13, 8, 10, 4, 17]
+    labels = [out_charset.index(c) for c in list(text)]
+    return text,labels
+
+def make_example(filename, image_data, labels, text, height, width):
+    """Build an Example proto for an example.
+    Args:
+    filename: string, path to an image file, e.g., '/path/to/example.JPG'
+    image_data: string, JPEG encoding of grayscale image
+    labels: integer list, identifiers for the ground truth for the network
+    text: string, unique human-readable, e.g. 'dog'
+    height: integer, image height in pixels
+    width: integer, image width in pixels
+  Returns:
+    Example proto
+  """
+    example = tf.train.Example(features=tf.train.Features(feature={
+        'image/encoded': _bytes_feature(tf.compat.as_bytes(image_data)),
+        'image/labels': _int64_feature(labels),
+        'image/height': _int64_feature([height]),
+        'image/width': _int64_feature([width]),
+        'image/filename': _bytes_feature(tf.compat.as_bytes(filename)),
+        'text/string': _bytes_feature(tf.compat.as_bytes(text)),
+        'text/length': _int64_feature([len(text)])
+    }))
+    return example
+
+def _int64_feature(values):
+    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
+
+def _bytes_feature(values):
+    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))
+
+def main(argv=None):
+    
+    gen_data('../data/images', 'annotation_train.txt', '../data/train/words')
+    gen_data('../data/images', 'annotation_val.txt',   '../data/val/words')
+    gen_data('../data/images', 'annotation_test.txt',  '../data/test/words')
+
+if __name__ == '__main__':
+    main()