
Commit 047e4e8

first commit
0 parents


54 files changed, +8943 -0 lines changed

Diff for: README.md

+61
@@ -0,0 +1,61 @@
# SSD-text detection: Text Detector

This is a modified SSD model for text detection.

Compared to Faster R-CNN, SSD is much faster: in my experiments, SSD needs only about 0.05 s per image.
### Disclaimer
This is a re-implementation of mxnet SSD. The official repository is available [here](https://github.com/dmlc/mxnet/tree/master/example/ssd).
The arXiv paper is available [here](http://arxiv.org/abs/1512.02325).
### Getting started
* Build MXNet: make sure the extra operators for this example are enabled, and follow the official instructions [here](https://github.com/dmlc/mxnet/tree/master/example/ssd).
### Train the model
I adapted the original SSD to SynthText and ICDAR. Other datasets can be supported easily by adding a subclass derived from the class `Imdb` in `dataset/imdb.py`.
See `dataset/pascal_voc.py` for a full example, or the minimal sketch below.
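As a rough illustration, here is a minimal sketch of such a subclass. It assumes only the `Imdb` interface visible in `dataset/concat_db.py` (`classes`, `num_images`, `image_set_index`, `image_path_from_index`, `label_from_index`); the dataset name, file layout, and label format are hypothetical.

```python
import os
import numpy as np
from imdb import Imdb

class MyTextDataset(Imdb):
    """Hypothetical single-class text dataset (a sketch, not part of the repo)."""
    def __init__(self, image_set, root_path):
        super(MyTextDataset, self).__init__('mytext_' + image_set)
        self.root_path = root_path
        self.classes = ['text']                  # single foreground class
        self.num_classes = len(self.classes)
        # assumed layout: one image name per line in <root_path>/<image_set>.txt
        with open(os.path.join(root_path, image_set + '.txt')) as f:
            self.image_set_index = [line.strip() for line in f]
        self.num_images = len(self.image_set_index)

    def image_path_from_index(self, index):
        # full path of the image for a given index
        return os.path.join(self.root_path, 'images',
                            self.image_set_index[index] + '.jpg')

    def label_from_index(self, index):
        # ground truth as [class_id, xmin, ymin, xmax, ymax] per box, with
        # coordinates normalized to [0, 1]; real loading is dataset-specific
        return np.array([[0, 0.1, 0.2, 0.5, 0.6]])
```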
* Download the converted pretrained `vgg16_reduced` model [here](https://dl.dropboxusercontent.com/u/39265872/vgg16_reduced.zip), and unzip the `.param` and `.json` files into the `model/` directory (the default location); a loading sketch follows.
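If the unzipped files follow MXNet's usual `prefix-epoch` checkpoint naming (an assumption; check the actual file names after unzipping), they can be loaded like this:

```python
import mxnet as mx

# assumes model/vgg16_reduced-symbol.json and model/vgg16_reduced-0000.params
sym, arg_params, aux_params = mx.model.load_checkpoint('model/vgg16_reduced', 0)
```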
To get good performance, first train the model on SynthText, which is a fairly large dataset (about 40 GB), and then fine-tune it on ICDAR. If you want to apply this model to other applications, you can fine-tune it on any dataset.
* Download the SynthText dataset [here](http://www.robots.ox.ac.uk/~vgg/data/scenetext/), and extract it into `data`.
Because SSD needs the size of every image and SynthText is very large, it would take too long to read each image's size with OpenCV every time training starts. Instead, `read_size.py` (in `data/synthtext_img_size`) creates an h5py file `size.h5` that stores the sizes of all images; copy this file into the extracted `SynthText` folder.
* Start training:
```
python train_synthtext.py
```
### Fine-tune the model
* Download the ICDAR Challenge 2 dataset [here](http://rrc.cvc.uab.es/?ch=2&com=introduction), and extract it into `data`.

* Start training:
```
python train_icdar.py --finetune N
```
Replace `N` with the epoch number of the checkpoint you saved when training on SynthText (see the sketch below for locating it).
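A small, hypothetical helper for listing the saved epoch numbers; it assumes the training script writes MXNet checkpoints ending in `-NNNN.params` under `model/` (verify against your actual checkpoint prefix):

```python
import glob
import re

# collect epoch numbers from files like model/<prefix>-0010.params
matches = [re.search(r'-(\d+)\.params$', p) for p in glob.glob('model/*.params')]
epochs = sorted(int(m.group(1)) for m in matches if m)
print('saved epochs:', epochs)   # pass the one you want as --finetune N
```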
### Try the demo
* After training, you can try your model on test images. Two demos are provided: `demo.py` visualizes the detection results, while `demo_savefig.py` saves the detection results as images.

When running `demo.py`, give the path of a test image.
* Run `demo.py`
```
# play with examples:
python demo.py --epoch 0 --images ./data/demo/test.jpg --thresh 0.5
```
* Check `python demo.py --help` for more options.

When running `demo_savefig.py`, give the path of a folder of test images.
* Run `demo_savefig.py`
```
# play with examples:
python demo_savefig.py --epoch 0 --images ./data/demo/test --thresh 0.5
```

Diff for: config/__init__.py

Whitespace-only changes.

Diff for: config/config.py

+32
@@ -0,0 +1,32 @@
import os
from easydict import EasyDict as edict
from tools.rand_sampler import RandCropper, RandPadder

cfg = edict()
cfg.ROOT_DIR = os.path.join(os.path.dirname(__file__), '..')

# training
cfg.TRAIN = edict()
cfg.TRAIN.RAND_SAMPLERS = [RandCropper(min_scale=1., max_trials=1, max_sample=1),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.1),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.3),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.5),
                           RandCropper(min_scale=.3, min_aspect_ratio=.5, max_aspect_ratio=2., min_overlap=.7),
                           RandPadder(max_scale=2., min_aspect_ratio=.5, max_aspect_ratio=2., min_gt_scale=.05),
                           RandPadder(max_scale=3., min_aspect_ratio=.5, max_aspect_ratio=2., min_gt_scale=.05),
                           RandPadder(max_scale=4., min_aspect_ratio=.5, max_aspect_ratio=2., min_gt_scale=.05)]
# cfg.TRAIN.RAND_SAMPLERS = []
cfg.TRAIN.RAND_MIRROR = True
cfg.TRAIN.INIT_SHUFFLE = True
cfg.TRAIN.EPOCH_SHUFFLE = True  # shuffle training list after each epoch
cfg.TRAIN.RAND_SEED = None
cfg.TRAIN.RESIZE_EPOCH = 1  # save model every N epoch

# validation
cfg.VALID = edict()
cfg.VALID.RAND_SAMPLERS = []
cfg.VALID.RAND_MIRROR = True
cfg.VALID.INIT_SHUFFLE = True
cfg.VALID.EPOCH_SHUFFLE = True
cfg.VALID.RAND_SEED = None
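For orientation, a minimal sketch of how this config object might be consumed; the import path assumes the repository root is on `PYTHONPATH`, and the actual training scripts may read it differently:

```python
from config.config import cfg

if cfg.TRAIN.RAND_MIRROR:
    print('training augments with random horizontal flips')
print('%d random crop/pad samplers configured' % len(cfg.TRAIN.RAND_SAMPLERS))
```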

Diff for: data/synthtext_img_size/read_size.py

+25
@@ -0,0 +1,25 @@
import scipy.io as sio
import cv2
import h5py
import numpy as np

print "load gt.mat"
gt = sio.loadmat("gt.mat")
# rows: height, width, channels; one column per image listed in gt.mat
size = np.zeros((3, gt['imnames'].shape[1]))

print "reading images..."
for idx in xrange(gt['imnames'].shape[1]):
    height, width, channels = cv2.imread(str(gt['imnames'][0, idx][0])).shape
    size[0, idx] = float(height)
    size[1, idx] = float(width)
    size[2, idx] = float(channels)
    if idx % 10000 == 0:
        print str(idx) + " images..."
        print size[:, idx]
print "reading images finished..."

print "store size..."
with h5py.File('size.h5', 'w') as f:
    size_store = f.create_dataset('size', size.shape)
    size_store[:] = size[:]
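Reading the stored sizes back is then cheap. A minimal sketch, assuming the `size` dataset layout written above (3 rows: height, width, channels; one column per image):

```python
import h5py

with h5py.File('size.h5', 'r') as f:
    size = f['size'][:]                # shape (3, num_images)
height, width, channels = size[:, 0]   # size of the first image
```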

Diff for: dataset/__init__.py

Whitespace-only changes.

Diff for: dataset/concat_db.py

+110
@@ -0,0 +1,110 @@
1+
from imdb import Imdb
2+
import random
3+
4+
class ConcatDB(Imdb):
5+
"""
6+
ConcatDB is used to concatenate multiple imdbs to form a larger db.
7+
It is very useful to combine multiple dataset with same classes.
8+
Parameters
9+
----------
10+
imdbs : Imdb or list of Imdb
11+
Imdbs to be concatenated
12+
shuffle : bool
13+
whether to shuffle the initial list
14+
"""
15+
def __init__(self, imdbs, shuffle):
16+
super(ConcatDB, self).__init__('concatdb')
17+
if not isinstance(imdbs, list):
18+
imdbs = [imdbs]
19+
self.imdbs = imdbs
20+
self._check_classes()
21+
self.image_set_index = self._load_image_set_index(shuffle)
22+
23+
def _check_classes(self):
24+
"""
25+
check input imdbs, make sure they have same classes
26+
"""
27+
try:
28+
self.classes = self.imdbs[0].classes
29+
self.num_classes = len(self.classes)
30+
except AttributeError:
31+
# fine, if no classes is provided
32+
pass
33+
34+
if self.num_classes > 0:
35+
for db in self.imdbs:
36+
assert self.classes == db.classes, "Multiple imdb must have same classes"
37+
38+
def _load_image_set_index(self, shuffle):
39+
"""
40+
get total number of images, init indices
41+
42+
Parameters
43+
----------
44+
shuffle : bool
45+
whether to shuffle the initial indices
46+
"""
47+
self.num_images = 0
48+
for db in self.imdbs:
49+
self.num_images += db.num_images
50+
indices = range(self.num_images)
51+
if shuffle:
52+
random.shuffle(indices)
53+
return indices
54+
55+
def _locate_index(self, index):
56+
"""
57+
given index, find out sub-db and sub-index
58+
59+
Parameters
60+
----------
61+
index : int
62+
index of a specific image
63+
64+
Returns
65+
----------
66+
a tuple (sub-db, sub-index)
67+
"""
68+
assert index >= 0 and index < self.num_images, "index out of range"
69+
pos = self.image_set_index[index]
70+
for k, v in enumerate(self.imdbs):
71+
if pos >= v.num_images:
72+
pos -= v.num_images
73+
else:
74+
return (k, pos)
75+
76+
def image_path_from_index(self, index):
77+
"""
78+
given image index, find out full path
79+
80+
Parameters
81+
----------
82+
index: int
83+
index of a specific image
84+
85+
Returns
86+
----------
87+
full path of this image
88+
"""
89+
assert self.image_set_index is not None, "Dataset not initialized"
90+
pos = self.image_set_index[index]
91+
n_db, n_index = self._locate_index(index)
92+
return self.imdbs[n_db].image_path_from_index(n_index)
93+
94+
def label_from_index(self, index):
95+
"""
96+
given image index, return preprocessed ground-truth
97+
98+
Parameters
99+
----------
100+
index: int
101+
index of a specific image
102+
103+
Returns
104+
----------
105+
ground-truths of this image
106+
"""
107+
assert self.image_set_index is not None, "Dataset not initialized"
108+
pos = self.image_set_index[index]
109+
n_db, n_index = self._locate_index(index)
110+
return self.imdbs[n_db].label_from_index(n_index)
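The interesting piece is `_locate_index`: a global (possibly shuffled) position is walked through the per-db image counts until it falls inside one sub-db. A standalone sketch of that mapping, with made-up counts for illustration:

```python
def locate(pos, num_images_per_db):
    # subtract each db's size until pos falls inside one of them;
    # returns (db index, local index within that db)
    for k, n in enumerate(num_images_per_db):
        if pos >= n:
            pos -= n
        else:
            return (k, pos)
    raise IndexError("index out of range")

print(locate(0, [5, 3]))   # (0, 0): first image of db 0
print(locate(5, [5, 3]))   # (1, 0): first image of db 1
print(locate(7, [5, 3]))   # (1, 2): last image of db 1
```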
