
Commit 047e4e8

first commit
0 parents


54 files changed, +8943 -0 lines changed

Diff for: README.md

+61
@@ -0,0 +1,61 @@
# SSD-text detection: Text Detector

This is a modified SSD model for text detection.

Compared to Faster R-CNN, SSD is much faster: in my experiments, SSD needs only about 0.05 s per image.
### Disclaimer
This is a re-implementation of mxnet SSD. The official repository is available [here](https://github.com/dmlc/mxnet/tree/master/example/ssd).
The arXiv paper is available [here](http://arxiv.org/abs/1512.02325).
### Getting started
* Build MXNet: make sure the extra operators for this example are enabled, and follow the official instructions [here](https://github.com/dmlc/mxnet/tree/master/example/ssd).
### Train the model
I adapted the original SSD to SynthText and ICDAR. Other datasets can be supported easily by adding a subclass derived from the class `Imdb` in `dataset/imdb.py`.
See `dataset/pascal_voc.py` for a full example, or the minimal sketch below.
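As a rough illustration, here is a minimal sketch of such a subclass. It assumes only the `Imdb` interface visible in `dataset/concat_db.py` (`classes`, `num_images`, `image_set_index`, `image_path_from_index`, `label_from_index`); the dataset name, file layout, and label format are hypothetical.

```python
import os
import numpy as np
from imdb import Imdb

class MyTextDataset(Imdb):
    """Hypothetical single-class text dataset (a sketch, not part of the repo)."""
    def __init__(self, image_set, root_path):
        super(MyTextDataset, self).__init__('mytext_' + image_set)
        self.root_path = root_path
        self.classes = ['text']                  # single foreground class
        self.num_classes = len(self.classes)
        # assumed layout: one image name per line in <root_path>/<image_set>.txt
        with open(os.path.join(root_path, image_set + '.txt')) as f:
            self.image_set_index = [line.strip() for line in f]
        self.num_images = len(self.image_set_index)

    def image_path_from_index(self, index):
        # full path of the image for a given index
        return os.path.join(self.root_path, 'images',
                            self.image_set_index[index] + '.jpg')

    def label_from_index(self, index):
        # ground truth as [class_id, xmin, ymin, xmax, ymax] per box, with
        # coordinates normalized to [0, 1]; real loading is dataset-specific
        return np.array([[0, 0.1, 0.2, 0.5, 0.6]])
```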
* Download the converted pretrained `vgg16_reduced` model [here](https://dl.dropboxusercontent.com/u/39265872/vgg16_reduced.zip), and unzip the `.param` and `.json` files into the `model/` directory (the default location); a loading sketch follows.
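If the unzipped files follow MXNet's usual `prefix-epoch` checkpoint naming (an assumption; check the actual file names after unzipping), they can be loaded like this:

```python
import mxnet as mx

# assumes model/vgg16_reduced-symbol.json and model/vgg16_reduced-0000.params
sym, arg_params, aux_params = mx.model.load_checkpoint('model/vgg16_reduced', 0)
```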
To get good performance, first train the model on SynthText, which is a fairly large dataset (about 40 GB), and then fine-tune it on ICDAR. If you want to apply this model to other applications, you can fine-tune it on any dataset.
* Download the SynthText dataset [here](http://www.robots.ox.ac.uk/~vgg/data/scenetext/), and extract it into `data`.
Because SSD needs the size of every image and SynthText is very large, it would take too long to read each image's size with OpenCV every time training starts. Instead, `read_size.py` (in `data/synthtext_img_size`) creates an h5py file `size.h5` that stores the sizes of all images; copy this file into the extracted `SynthText` folder.
* Start training:
```
python train_synthtext.py
```
### Fine-tune the model
* Download the ICDAR Challenge 2 dataset [here](http://rrc.cvc.uab.es/?ch=2&com=introduction), and extract it into `data`.

* Start training:
```
python train_icdar.py --finetune N
```
Replace `N` with the epoch number of the checkpoint you saved when training on SynthText (see the sketch below for locating it).
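A small, hypothetical helper for listing the saved epoch numbers; it assumes the training script writes MXNet checkpoints ending in `-NNNN.params` under `model/` (verify against your actual checkpoint prefix):

```python
import glob
import re

# collect epoch numbers from files like model/<prefix>-0010.params
matches = [re.search(r'-(\d+)\.params$', p) for p in glob.glob('model/*.params')]
epochs = sorted(int(m.group(1)) for m in matches if m)
print('saved epochs:', epochs)   # pass the one you want as --finetune N
```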
### Try the demo
* After training, you can try your model on test images. Two demos are provided: `demo.py` visualizes the detection results, while `demo_savefig.py` saves the detection results as images.

When running `demo.py`, give the path of a test image.
* Run `demo.py`
```
# play with examples:
python demo.py --epoch 0 --images ./data/demo/test.jpg --thresh 0.5
```
* Check `python demo.py --help` for more options.

When running `demo_savefig.py`, give the path of a folder of test images.
* Run `demo_savefig.py`
```
# play with examples:
python demo_savefig.py --epoch 0 --images ./data/demo/test --thresh 0.5
```

Diff for: config/__init__.py

Whitespace-only changes.

Diff for: config/config.py

+32
@@ -0,0 +1,32 @@
import os
from easydict import EasyDict as edict
from tools.rand_sampler import RandCropper, RandPadder

cfg = edict()
cfg.ROOT_DIR = os.path.join(os.path.dirname(__file__), '..')

# training
cfg.TRAIN = edict()
cfg.TRAIN.RAND_SAMPLERS = [RandCropper(min_scale=1., max_trials=1, max_sample=1),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.1),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.3),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.5),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.7),
                           RandPadder(max_scale=2., min_aspect_ratio=.5, max_aspect_ratio=2., min_gt_scale=.05),
                           RandPadder(max_scale=3., min_aspect_ratio=.5, max_aspect_ratio=2., min_gt_scale=.05),
                           RandPadder(max_scale=4., min_aspect_ratio=.5, max_aspect_ratio=2., min_gt_scale=.05)]
# cfg.TRAIN.RAND_SAMPLERS = []
cfg.TRAIN.RAND_MIRROR = True
cfg.TRAIN.INIT_SHUFFLE = True
cfg.TRAIN.EPOCH_SHUFFLE = True  # shuffle training list after each epoch
cfg.TRAIN.RAND_SEED = None
cfg.TRAIN.RESIZE_EPOCH = 1  # save model every N epoch

# validation
cfg.VALID = edict()
cfg.VALID.RAND_SAMPLERS = []
cfg.VALID.RAND_MIRROR = True
cfg.VALID.INIT_SHUFFLE = True
cfg.VALID.EPOCH_SHUFFLE = True
cfg.VALID.RAND_SEED = None
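For orientation, a minimal sketch of how this config object might be consumed; the import path assumes the repository root is on `PYTHONPATH`, and the actual training scripts may read it differently:

```python
from config.config import cfg

if cfg.TRAIN.RAND_MIRROR:
    print('training augments with random horizontal flips')
print('%d random crop/pad samplers configured' % len(cfg.TRAIN.RAND_SAMPLERS))
```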

Diff for: data/synthtext_img_size/read_size.py

+25
@@ -0,0 +1,25 @@
import scipy.io as sio
import cv2
import h5py
import numpy as np

print "load gt.mat"
gt = sio.loadmat("gt.mat")
# rows: height, width, channels; one column per image listed in gt.mat
size = np.zeros((3, gt['imnames'].shape[1]))

print "reading images..."
for idx in xrange(gt['imnames'].shape[1]):
    height, width, channels = cv2.imread(str(gt['imnames'][0, idx][0])).shape
    size[0, idx] = float(height)
    size[1, idx] = float(width)
    size[2, idx] = float(channels)
    if idx % 10000 == 0:
        print str(idx) + " images..."
        print size[:, idx]
print "reading images finished..."

print "store size..."
with h5py.File('size.h5', 'w') as f:
    size_store = f.create_dataset('size', size.shape)
    size_store[:] = size[:]
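Reading the stored sizes back is then cheap. A minimal sketch, assuming the `size` dataset layout written above (3 rows: height, width, channels; one column per image):

```python
import h5py

with h5py.File('size.h5', 'r') as f:
    size = f['size'][:]                # shape (3, num_images)
height, width, channels = size[:, 0]   # size of the first image
```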

Diff for: dataset/__init__.py

Whitespace-only changes.

Diff for: dataset/concat_db.py

+110
@@ -0,0 +1,110 @@
1+
from imdb import Imdb
2+
import random
3+
4+
class ConcatDB(Imdb):
5+
"""
6+
ConcatDB is used to concatenate multiple imdbs to form a larger db.
7+
It is very useful to combine multiple dataset with same classes.
8+
Parameters
9+
----------
10+
imdbs : Imdb or list of Imdb
11+
Imdbs to be concatenated
12+
shuffle : bool
13+
whether to shuffle the initial list
14+
"""
15+
def __init__(self, imdbs, shuffle):
16+
super(ConcatDB, self).__init__('concatdb')
17+
if not isinstance(imdbs, list):
18+
imdbs = [imdbs]
19+
self.imdbs = imdbs
20+
self._check_classes()
21+
self.image_set_index = self._load_image_set_index(shuffle)
22+
23+
def _check_classes(self):
24+
"""
25+
check input imdbs, make sure they have same classes
26+
"""
27+
try:
28+
self.classes = self.imdbs[0].classes
29+
self.num_classes = len(self.classes)
30+
except AttributeError:
31+
# fine, if no classes is provided
32+
pass
33+
34+
if self.num_classes > 0:
35+
for db in self.imdbs:
36+
assert self.classes == db.classes, "Multiple imdb must have same classes"
37+
38+
def _load_image_set_index(self, shuffle):
39+
"""
40+
get total number of images, init indices
41+
42+
Parameters
43+
----------
44+
shuffle : bool
45+
whether to shuffle the initial indices
46+
"""
47+
self.num_images = 0
48+
for db in self.imdbs:
49+
self.num_images += db.num_images
50+
indices = range(self.num_images)
51+
if shuffle:
52+
random.shuffle(indices)
53+
return indices
54+
55+
def _locate_index(self, index):
56+
"""
57+
given index, find out sub-db and sub-index
58+
59+
Parameters
60+
----------
61+
index : int
62+
index of a specific image
63+
64+
Returns
65+
----------
66+
a tuple (sub-db, sub-index)
67+
"""
68+
assert index >= 0 and index < self.num_images, "index out of range"
69+
pos = self.image_set_index[index]
70+
for k, v in enumerate(self.imdbs):
71+
if pos >= v.num_images:
72+
pos -= v.num_images
73+
else:
74+
return (k, pos)
75+
76+
def image_path_from_index(self, index):
77+
"""
78+
given image index, find out full path
79+
80+
Parameters
81+
----------
82+
index: int
83+
index of a specific image
84+
85+
Returns
86+
----------
87+
full path of this image
88+
"""
89+
assert self.image_set_index is not None, "Dataset not initialized"
90+
pos = self.image_set_index[index]
91+
n_db, n_index = self._locate_index(index)
92+
return self.imdbs[n_db].image_path_from_index(n_index)
93+
94+
def label_from_index(self, index):
95+
"""
96+
given image index, return preprocessed ground-truth
97+
98+
Parameters
99+
----------
100+
index: int
101+
index of a specific image
102+
103+
Returns
104+
----------
105+
ground-truths of this image
106+
"""
107+
assert self.image_set_index is not None, "Dataset not initialized"
108+
pos = self.image_set_index[index]
109+
n_db, n_index = self._locate_index(index)
110+
return self.imdbs[n_db].label_from_index(n_index)
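The interesting piece is `_locate_index`: a global (possibly shuffled) position is walked through the per-db image counts until it falls inside one sub-db. A standalone sketch of that mapping, with made-up counts for illustration:

```python
def locate(pos, num_images_per_db):
    # subtract each db's size until pos falls inside one of them;
    # returns (db index, local index within that db)
    for k, n in enumerate(num_images_per_db):
        if pos >= n:
            pos -= n
        else:
            return (k, pos)
    raise IndexError("index out of range")

print(locate(0, [5, 3]))   # (0, 0): first image of db 0
print(locate(5, [5, 3]))   # (1, 0): first image of db 1
print(locate(7, [5, 3]))   # (1, 2): last image of db 1
```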
