diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e003d51
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,113 @@
+legacy/*
+.DS_Store
+debug/*
+*.DS_Store
+*.json
+*.mat
+src/.vscode/*
+preds/*
+*.h5
+*.pth
+*.checkpoint
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+Notes.txt
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..be62acf
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "src/lib/models/networks/DCNv2"]
+	path = src/lib/models/networks/DCNv2
+	url = https://github.com/CharlesShang/DCNv2.git
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..3dbd32f
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,20 @@
+group: travis_latest
+dist: xenial  # ubuntu-16.04
+language: python
+cache: pip
+python:
+  - 3.6
+  - 3.7
+install:
+  - pip install flake8
+  - pip install -r requirements.txt
+before_script:
+  # stop the build if there are Python syntax errors or undefined names
+  - flake8 . --count --select=E9,F63,F72,F82 --show-source --statistics
+  # exit-zero treats all errors as warnings.  The GitHub editor is 127 chars wide
+  - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+script:
+  - true  # add other tests here
+notifications:
+  on_success: change
+  on_failure: change  # `always` will be the setting once code changes slow down
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f3b96bb
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+MIT License
+
+Copyright - CenterSeg (c) 2020 Ajai John Chemmanam
+
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/NOTICE b/NOTICE
new file mode 100644
index 0000000..f3c1b1e
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1,182 @@
+Portions of this project are borrowed from multiple sources
+
+
+==============================================================================
+Centernet Licence
+==============================================================================
+
+MIT License
+
+Copyright - Centernet (c) 2019 Xingyi Zhou
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+
+==============================================================================
+tf-faster-rcnn licence
+==============================================================================
+
+MIT License
+
+Copyright (c) 2017 Xinlei Chen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+==============================================================================
+human-pose-estimation.pytorch licence
+==============================================================================
+    MIT License
+
+    Copyright (c) Microsoft Corporation. All rights reserved.
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy
+    of this software and associated documentation files (the "Software"), to deal
+    in the Software without restriction, including without limitation the rights
+    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    copies of the Software, and to permit persons to whom the Software is
+    furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all
+    copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE
+
+==============================================================================
+CornerNet licence
+==============================================================================
+
+BSD 3-Clause License
+
+Copyright (c) 2018, University of Michigan
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+==============================================================================
+DCNv2 licence
+==============================================================================
+
+BSD 3-Clause License
+
+Copyright (c) 2019, Charles Shang
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+==============================================================================
+DLA licence
+==============================================================================
+
+BSD 3-Clause License
+
+Copyright (c) 2018, Fisher Yu
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..265f282
--- /dev/null
+++ b/README.md
@@ -0,0 +1,74 @@
+# CenterSeg
+
+This repo uses Centernet and Conditional Convolutions for Instance Segmentation
+
+> [**Objects as Points**](http://arxiv.org/abs/1904.07850),  
+> [**CondInst: Conditional Convolutions for Instance Segmentation**](https://arxiv.org/abs/2003.05664)
+
+## Installation
+
+Please refer to [INSTALL.md](readme/INSTALL.md) for installation instructions.
+
+This repo supports both CPU and GPU Training and Inference.
+
+## Pre-Trained Models
+
+Will be uploaded soon
+
+## Setup
+```
+git clone {this repo}
+
+pip3 install -r requirements.txt
+```
+
+Compile DCN
+
+```
+cd src/lib/models/networks/DCNv2/
+
+python3 setup.py build develop
+```
+
+Compile NMS
+```
+cd src/lib/external
+
+python3 setup.py build_ext --inplace
+```
+
+#### Training
+
+###### For GPU
+```
+python3 main.py ctseg --exp_id coco_dla_1x --batch_size 10 --master_batch 5 --lr 1.25e-4 --gpus 0 --num_workers 4
+```
+
+###### For CPU
+```
+python3 main.py ctseg --exp_id coco_dla_1x --batch_size 2 --master_batch -1 --lr 1.25e-4 --gpus -1 --num_workers 4
+```
+
+#### Testing
+```
+python3 test.py ctseg --exp_id coco_dla_1x --keep_res --resume
+```
+
+#### Demo
+```
+python3 demo.py ctseg --exp_id coco_dla_1x --keep_res --resume --demo ../data/coco/val2017
+```
+
+## License
+
+CenterSeg is released under the MIT License (refer to the LICENSE file for details).
+This repo contains code borrowed from multiple sources. Please see their respective licenses.
+
+## Credits
+
+https://github.com/xingyizhou
+
+https://github.com/Epiphqny
+
+https://github.com/CaoWGG
+
diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/exp/.gitignore b/exp/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/exp/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/experiments/ctdet_coco_dla_1x.sh b/experiments/ctdet_coco_dla_1x.sh
new file mode 100644
index 0000000..ca69d8e
--- /dev/null
+++ b/experiments/ctdet_coco_dla_1x.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py ctdet --exp_id coco_dla_1x --batch_size 128 --master_batch 9 --lr 5e-4 --gpus 0,1,2,3,4,5,6,7 --num_workers 16
+# test
+python test.py ctdet --exp_id coco_dla_1x --keep_res --resume
+# flip test
+python test.py ctdet --exp_id coco_dla_1x --keep_res --resume --flip_test 
+# multi scale test
+python test.py ctdet --exp_id coco_dla_1x --keep_res --resume --flip_test --test_scales 0.5,0.75,1,1.25,1.5
+cd ..
diff --git a/experiments/ctdet_coco_dla_2x.sh b/experiments/ctdet_coco_dla_2x.sh
new file mode 100644
index 0000000..8aeeed9
--- /dev/null
+++ b/experiments/ctdet_coco_dla_2x.sh
@@ -0,0 +1,12 @@
+cd src
+# train
+python main.py ctdet --exp_id coco_dla_2x --batch_size 128 --master_batch 9 --lr 5e-4 --gpus 0,1,2,3,4,5,6,7 --num_workers 16 --num_epochs 230 --lr_step 180,210
+# or use the following command if you have coco_dla_1x trained
+# python main.py ctdet --exp_id coco_dla_2x --batch_size 128 --master_batch 9 --lr 5e-4 --gpus 0,1,2,3,4,5,6,7 --num_workers 16 --load_model ../exp/ctdet/coco_dla_1x/model_90.pth --resume
+# test
+python test.py ctdet --exp_id coco_dla_2x --keep_res --resume
+# flip test
+python test.py ctdet --exp_id coco_dla_2x --keep_res --resume --flip_test
+# multi scale test
+python test.py ctdet --exp_id coco_dla_2x --keep_res --resume --flip_test --test_scales 0.5,0.75,1,1.25,1.5
+cd ..
diff --git a/experiments/ctdet_coco_hg.sh b/experiments/ctdet_coco_hg.sh
new file mode 100644
index 0000000..5df68a3
--- /dev/null
+++ b/experiments/ctdet_coco_hg.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py ctdet --exp_id coco_hg --arch hourglass --batch_size 24 --master_batch 4 --lr 2.5e-4 --load_model ../models/ExtremeNet_500000.pth --gpus 0,1,2,3,4
+# test
+python test.py ctdet --exp_id coco_hg --arch hourglass --keep_res --resume
+# flip test
+python test.py ctdet --exp_id coco_hg --arch hourglass --keep_res --resume --flip_test 
+# multi scale test
+python test.py ctdet --exp_id coco_hg --arch hourglass --keep_res --resume --flip_test --test_scales 0.5,0.75,1,1.25,1.5
+cd ..
\ No newline at end of file
diff --git a/experiments/ctdet_coco_resdcn101.sh b/experiments/ctdet_coco_resdcn101.sh
new file mode 100644
index 0000000..156b6ca
--- /dev/null
+++ b/experiments/ctdet_coco_resdcn101.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py ctdet --exp_id coco_resdcn101 --arch resdcn_101 --batch_size 96 --master_batch 5 --lr 3.75e-4 --gpus 0,1,2,3,4,5,6,7 --num_workers 16
+# test
+python test.py ctdet --exp_id coco_resdcn101 --arch resdcn_101 --keep_res --resume
+# flip test
+python test.py ctdet --exp_id coco_resdcn101 --arch resdcn_101 --keep_res --resume --flip_test
+# multi scale test
+python test.py ctdet --exp_id coco_resdcn101 --arch resdcn_101 --keep_res --resume --flip_test --test_scales 0.5,0.75,1,1.25,1.5
+cd ..
diff --git a/experiments/ctdet_coco_resdcn18.sh b/experiments/ctdet_coco_resdcn18.sh
new file mode 100644
index 0000000..4a92f54
--- /dev/null
+++ b/experiments/ctdet_coco_resdcn18.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py ctdet --exp_id coco_resdcn18 --arch resdcn_18 --batch_size 114 --master_batch 18 --lr 5e-4 --gpus 0,1,2,3 --num_workers 16
+# test
+python test.py ctdet --exp_id coco_resdcn18 --arch resdcn_18 --keep_res --resume
+# flip test
+python test.py ctdet --exp_id coco_resdcn18 --arch resdcn_18 --keep_res --resume --flip_test 
+# multi scale test
+python test.py ctdet --exp_id coco_resdcn18 --arch resdcn_18 --keep_res --resume --flip_test --test_scales 0.5,0.75,1,1.25,1.5
+cd ..
diff --git a/experiments/ctdet_pascal_dla_384.sh b/experiments/ctdet_pascal_dla_384.sh
new file mode 100644
index 0000000..d6dbdd8
--- /dev/null
+++ b/experiments/ctdet_pascal_dla_384.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py ctdet --exp_id pascal_dla_384 --dataset pascal --num_epochs 70 --lr_step 45,60
+# test
+python test.py ctdet --exp_id pascal_dla_384 --dataset pascal --resume
+# flip test
+python test.py ctdet --exp_id pascal_dla_384 --dataset pascal --resume --flip_test
+cd ..
diff --git a/experiments/ctdet_pascal_dla_512.sh b/experiments/ctdet_pascal_dla_512.sh
new file mode 100644
index 0000000..22ca1f4
--- /dev/null
+++ b/experiments/ctdet_pascal_dla_512.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py ctdet --exp_id pascal_dla_512 --dataset pascal --input_res 512 --num_epochs 70 --lr_step 45,60 --gpus 0,1
+# test
+python test.py ctdet --exp_id pascal_dla_512 --dataset pascal --input_res 512 --resume
+# flip test
+python test.py ctdet --exp_id pascal_dla_512 --dataset pascal --input_res 512 --resume --flip_test
+cd ..
diff --git a/experiments/ctdet_pascal_resdcn101_384.sh b/experiments/ctdet_pascal_resdcn101_384.sh
new file mode 100644
index 0000000..95d7fcb
--- /dev/null
+++ b/experiments/ctdet_pascal_resdcn101_384.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py ctdet --exp_id pascal_resdcn101_384 --arch resdcn_101 --dataset pascal --num_epochs 70 --lr_step 45,60 --gpus 0,1
+# test
+python test.py ctdet --exp_id pascal_resdcn101_384 --arch resdcn_101 --dataset pascal --resume
+# flip test
+python test.py ctdet --exp_id pascal_resdcn101_384 --arch resdcn_101 --dataset pascal --resume --flip_test
+cd ..
diff --git a/experiments/ctdet_pascal_resdcn101_512.sh b/experiments/ctdet_pascal_resdcn101_512.sh
new file mode 100644
index 0000000..8eec739
--- /dev/null
+++ b/experiments/ctdet_pascal_resdcn101_512.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py ctdet --exp_id pascal_resdcn101_512 --arch resdcn_101 --dataset pascal --input_res 512 --num_epochs 70 --lr_step 45,60 --gpus 0,1,2,3
+# test
+python test.py ctdet --exp_id pascal_resdcn101_512 --arch resdcn_101 --dataset pascal --input_res 512 --resume
+# flip test
+python test.py ctdet --exp_id pascal_resdcn101_512 --arch resdcn_101 --dataset pascal --input_res 512 --resume --flip_test
+cd ..
diff --git a/experiments/ctdet_pascal_resdcn18_384.sh b/experiments/ctdet_pascal_resdcn18_384.sh
new file mode 100644
index 0000000..eada159
--- /dev/null
+++ b/experiments/ctdet_pascal_resdcn18_384.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py ctdet --exp_id pascal_resdcn18_384 --arch resdcn_18 --dataset pascal --num_epochs 70 --lr_step 45,60
+# test
+python test.py ctdet --exp_id pascal_resdcn18_384 --arch resdcn_18 --dataset pascal --resume
+# flip test
+python test.py ctdet --exp_id pascal_resdcn18_384 --arch resdcn_18 --dataset pascal --resume --flip_test
+cd ..
diff --git a/experiments/ctdet_pascal_resdcn18_512.sh b/experiments/ctdet_pascal_resdcn18_512.sh
new file mode 100644
index 0000000..abe7dbb
--- /dev/null
+++ b/experiments/ctdet_pascal_resdcn18_512.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py ctdet --exp_id pascal_resdcn18_512 --arch resdcn_18 --dataset pascal --input_res 512 --num_epochs 70 --lr_step 45,60
+# test
+python test.py ctdet --exp_id pascal_resdcn18_512 --arch resdcn_18 --dataset pascal --input_res 512 --resume
+# flip test
+python test.py ctdet --exp_id pascal_resdcn18_512 --arch resdcn_18 --dataset pascal --input_res 512 --resume --flip_test
+cd ..
diff --git a/experiments/ctseg_coco_dla_1x.sh b/experiments/ctseg_coco_dla_1x.sh
new file mode 100644
index 0000000..8c5064c
--- /dev/null
+++ b/experiments/ctseg_coco_dla_1x.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py ctseg --exp_id coco_dla_1x --batch_size 20 --master_batch 9 --lr 1.25e-4 --gpus 0,1 --num_workers 4
+# python3 main.py ctseg --exp_id coco_dla_1x --batch_size 2 --master_batch -1 --lr 1.25e-4 --gpus -1 --num_workers 0
+# test
+python test.py ctseg --exp_id coco_dla_1x --keep_res --resume
+
+# Visualize
+# python3 demo.py ctseg --exp_id coco_dla_1x --keep_res --resume --demo ../data/coco/val2017
+cd ..
\ No newline at end of file
diff --git a/experiments/ddd_3dop.sh b/experiments/ddd_3dop.sh
new file mode 100644
index 0000000..611fcac
--- /dev/null
+++ b/experiments/ddd_3dop.sh
@@ -0,0 +1,6 @@
+cd src
+# train
+python main.py ddd --exp_id 3dop --dataset kitti --kitti_split 3dop --batch_size 16 --master_batch 7 --num_epochs 70 --lr_step 45,60 --gpus 0,1
+# test
+python test.py ddd --exp_id 3dop --dataset kitti --kitti_split 3dop --resume
+cd ..
diff --git a/experiments/ddd_sub.sh b/experiments/ddd_sub.sh
new file mode 100644
index 0000000..117461f
--- /dev/null
+++ b/experiments/ddd_sub.sh
@@ -0,0 +1,6 @@
+cd src
+# train
+python main.py ddd --exp_id sub --dataset kitti --kitti_split subcnn --batch_size 16 --master_batch 7 --num_epochs 70 --lr_step 45,60 --gpus 0,1
+# test
+python test.py ddd --exp_id sub --dataset kitti --kitti_split subcnn --resume
+cd ..
diff --git a/experiments/exdet_coco_dla.sh b/experiments/exdet_coco_dla.sh
new file mode 100644
index 0000000..a04e81a
--- /dev/null
+++ b/experiments/exdet_coco_dla.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py exdet --exp_id coco_dla --batch_size 64 --master_batch 1 --lr 2.5e-4 --gpus 0,1,2,3,4,5,6,7 --num_workers 8
+# test
+python test.py exdet --exp_id coco_dla --keep_res --resume
+# flip test
+python test.py exdet --exp_id coco_dla --keep_res --resume --flip_test 
+# multi scale test
+python test.py exdet --exp_id coco_dla --keep_res --resume --flip_test --test_scales 0.5,0.75,1,1.25,1.5
+cd ..
diff --git a/experiments/exdet_coco_hg.sh b/experiments/exdet_coco_hg.sh
new file mode 100644
index 0000000..73dc5b2
--- /dev/null
+++ b/experiments/exdet_coco_hg.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py exdet --exp_id coco_hg --arch hourglass --batch_size 24 --master_batch 4 --lr 2.5e-4 --gpus 0,1,2,3,4
+# test
+python test.py exdet --exp_id coco_hg --arch hourglass --keep_res --resume
+# flip test
+python test.py exdet --exp_id coco_hg --arch hourglass --keep_res --resume --flip_test 
+# multi scale test
+python test.py exdet --exp_id coco_hg --arch hourglass --keep_res --resume --flip_test --test_scales 0.5,0.75,1,1.25,1.5
+cd ..
diff --git a/experiments/multi_pose_dla_1x.sh b/experiments/multi_pose_dla_1x.sh
new file mode 100644
index 0000000..b0eae74
--- /dev/null
+++ b/experiments/multi_pose_dla_1x.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py multi_pose --exp_id dla_1x --dataset coco_hp --batch_size 128 --master_batch 9 --lr 5e-4 --load_model ../models/ctdet_coco_dla_2x.pth --gpus 0,1,2,3,4,5,6,7 --num_workers 16
+# test
+python test.py multi_pose --exp_id dla_1x --dataset coco_hp --keep_res --resume
+# flip test
+python test.py multi_pose --exp_id dla_1x --dataset coco_hp --keep_res --resume --flip_test
+cd ..
diff --git a/experiments/multi_pose_dla_3x.sh b/experiments/multi_pose_dla_3x.sh
new file mode 100644
index 0000000..625afc1
--- /dev/null
+++ b/experiments/multi_pose_dla_3x.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py multi_pose --exp_id dla_3x --dataset coco_hp --batch_size 128 --master_batch 9 --lr 5e-4 --load_model ../models/ctdet_coco_dla_2x.pth --gpus 0,1,2,3,4,5,6,7 --num_workers 16 --num_epochs 320 --lr_step 270,300
+# or use the following command if you have dla_1x trained
+# python main.py multi_pose --exp_id dla_3x --dataset coco_hp --batch_size 128 --master_batch 9 --lr 5e-4 --gpus 0,1,2,3,4,5,6,7 --num_workers 16 --load_model ../exp/multi_pose/dla_1x/model_90.pth --resume
+# test
+python test.py multi_pose --exp_id dla_3x --dataset coco_hp --keep_res --resume
+# flip test
+python test.py multi_pose --exp_id dla_3x --dataset coco_hp --keep_res --resume --flip_test
+cd ..
diff --git a/experiments/multi_pose_hg_1x.sh b/experiments/multi_pose_hg_1x.sh
new file mode 100644
index 0000000..7444b36
--- /dev/null
+++ b/experiments/multi_pose_hg_1x.sh
@@ -0,0 +1,8 @@
+cd src
+# train
+python main.py multi_pose --exp_id hg_1x --dataset coco_hp --arch hourglass --batch_size 24 --master_batch 4 --lr 2.5e-4 --load_model ../models/ctdet_coco_hg.pth --gpus 0,1,2,3,4 --num_epochs 50 --lr_step 40
+# test
+python test.py multi_pose --exp_id hg_1x --dataset coco_hp --arch hourglass --keep_res --resume
+# flip test
+python test.py multi_pose --exp_id hg_1x --dataset coco_hp --arch hourglass --keep_res --resume --flip_test
+cd ..
diff --git a/experiments/multi_pose_hg_3x.sh b/experiments/multi_pose_hg_3x.sh
new file mode 100644
index 0000000..da449db
--- /dev/null
+++ b/experiments/multi_pose_hg_3x.sh
@@ -0,0 +1,10 @@
+cd src
+# train
+python main.py multi_pose --exp_id hg_3x --dataset coco_hp --arch hourglass --batch_size 24 --master_batch 4 --lr 2.5e-4 --load_model ../models/ctdet_coco_hg.pth --gpus 0,1,2,3,4 --num_epochs 150 --lr_step 130
+# or use the following command if you have hg_1x trained
+# python main.py multi_pose --exp_id hg_3x --dataset coco_hp  --arch hourglass --batch_size 24 --master_batch 4 --lr 2.5e-4 --gpus 0,1,2,3,4 --num_epochs 150 --lr_step 130 --load_model ../exp/multi_pose/hg_1x/model_40.pth --resume
+# test
+python test.py multi_pose --exp_id hg_3x --dataset coco_hp --arch hourglass --keep_res --resume
+# flip test
+python test.py multi_pose --exp_id hg_3x --dataset coco_hp --arch hourglass --keep_res --resume --flip_test
+cd ..
diff --git a/images/16004479832_a748d55f21_k.jpg b/images/16004479832_a748d55f21_k.jpg
new file mode 100644
index 0000000..5ef0680
Binary files /dev/null and b/images/16004479832_a748d55f21_k.jpg differ
diff --git a/images/17790319373_bd19b24cfc_k.jpg b/images/17790319373_bd19b24cfc_k.jpg
new file mode 100644
index 0000000..8d7ce5e
Binary files /dev/null and b/images/17790319373_bd19b24cfc_k.jpg differ
diff --git a/images/18124840932_e42b3e377c_k.jpg b/images/18124840932_e42b3e377c_k.jpg
new file mode 100644
index 0000000..0e20882
Binary files /dev/null and b/images/18124840932_e42b3e377c_k.jpg differ
diff --git a/images/19064748793_bb942deea1_k.jpg b/images/19064748793_bb942deea1_k.jpg
new file mode 100644
index 0000000..6269382
Binary files /dev/null and b/images/19064748793_bb942deea1_k.jpg differ
diff --git a/images/24274813513_0cfd2ce6d0_k.jpg b/images/24274813513_0cfd2ce6d0_k.jpg
new file mode 100644
index 0000000..2f3271a
Binary files /dev/null and b/images/24274813513_0cfd2ce6d0_k.jpg differ
diff --git a/images/33823288584_1d21cf0a26_k.jpg b/images/33823288584_1d21cf0a26_k.jpg
new file mode 100644
index 0000000..c218118
Binary files /dev/null and b/images/33823288584_1d21cf0a26_k.jpg differ
diff --git a/images/33887522274_eebd074106_k.jpg b/images/33887522274_eebd074106_k.jpg
new file mode 100644
index 0000000..3173f58
Binary files /dev/null and b/images/33887522274_eebd074106_k.jpg differ
diff --git a/images/34501842524_3c858b3080_k.jpg b/images/34501842524_3c858b3080_k.jpg
new file mode 100644
index 0000000..26398dc
Binary files /dev/null and b/images/34501842524_3c858b3080_k.jpg differ
diff --git a/images/NOTICE b/images/NOTICE
new file mode 100644
index 0000000..506f76e
--- /dev/null
+++ b/images/NOTICE
@@ -0,0 +1,32 @@
+The demo images are licensed as United States government work:
+https://www.usa.gov/government-works
+
+The image files were obtained on Jan 13, 2018 from the following
+URLs.
+
+16004479832_a748d55f21_k.jpg
+https://www.flickr.com/photos/archivesnews/16004479832
+
+18124840932_e42b3e377c_k.jpg
+https://www.flickr.com/photos/usnavy/18124840932
+
+33887522274_eebd074106_k.jpg
+https://www.flickr.com/photos/usaid_pakistan/33887522274
+
+15673749081_767a7fa63a_k.jpg
+https://www.flickr.com/photos/usnavy/15673749081
+
+34501842524_3c858b3080_k.jpg
+https://www.flickr.com/photos/departmentofenergy/34501842524
+
+24274813513_0cfd2ce6d0_k.jpg
+https://www.flickr.com/photos/dhsgov/24274813513
+
+19064748793_bb942deea1_k.jpg
+https://www.flickr.com/photos/statephotos/19064748793
+
+33823288584_1d21cf0a26_k.jpg
+https://www.flickr.com/photos/cbpphotos/33823288584
+
+17790319373_bd19b24cfc_k.jpg
+https://www.flickr.com/photos/secdef/17790319373
diff --git a/models/.gitignore b/models/.gitignore
new file mode 100644
index 0000000..d6b7ef3
--- /dev/null
+++ b/models/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
diff --git a/readme/DATA.md b/readme/DATA.md
new file mode 100644
index 0000000..c455f94
--- /dev/null
+++ b/readme/DATA.md
@@ -0,0 +1,115 @@
+# Dataset preparation
+
+If you want to reproduce the results in the paper for benchmark evaluation and training, you will need to set up the datasets.
+
+
+### COCO
+- Download the images (2017 Train, 2017 Val, 2017 Test) from [coco website](http://cocodataset.org/#download).
+- Download annotation files (2017 train/val and test image info) from [coco website](http://cocodataset.org/#download). 
+- Place the data (or create symlinks) to make the data folder like:
+
+  ~~~
+  ${CenterNet_ROOT}
+  |-- data
+  `-- |-- coco
+      `-- |-- annotations
+          |   |-- instances_train2017.json
+          |   |-- instances_val2017.json
+          |   |-- person_keypoints_train2017.json
+          |   |-- person_keypoints_val2017.json
+          |   |-- image_info_test-dev2017.json
+          |---|-- train2017
+          |---|-- val2017
+          `---|-- test2017
+  ~~~
+
+- [Optional] If you want to train ExtremeNet, generate extreme point annotation from segmentation:
+    
+    ~~~
+    cd $CenterNet_ROOT/tools/
+    python gen_coco_extreme_points.py
+    ~~~
+  It generates `instances_extreme_train2017.json` and `instances_extreme_val2017.json` in `data/coco/annotations/`. 
+
+### Pascal VOC
+
+- Run
+
+    ~~~
+    cd $CenterNet_ROOT/tools/
+    bash get_pascal_voc.sh
+    ~~~
+- The above script includes:
+    - Download, unzip, and move Pascal VOC images from the [VOC website](http://host.robots.ox.ac.uk/pascal/VOC/). 
+    - [Download](https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip) Pascal VOC annotation in COCO format (from [Detectron](https://github.com/facebookresearch/Detectron/tree/master/detectron/datasets/data)). 
+    - Combine train/val 2007/2012 annotation files into a single json. 
+
+
+- Move the created `voc` folder to `data` (or create symlinks) to make the data folder like:
+
+  ~~~
+  ${CenterNet_ROOT}
+  |-- data
+  `-- |-- voc
+      `-- |-- annotations
+          |   |-- pascal_trainval0712.json
+          |   |-- pascal_test2007.json
+          |-- images
+          |   |-- 000001.jpg
+          |   ......
+          `-- VOCdevkit
+  
+  ~~~
+  The `VOCdevkit` folder is needed to run the evaluation script from [faster rcnn](https://github.com/rbgirshick/py-faster-rcnn/blob/master/tools/reval.py).
+
+### KITTI
+
+- Download [images](http://www.cvlibs.net/download.php?file=data_object_image_2.zip), [annotations](http://www.cvlibs.net/download.php?file=data_object_label_2.zip), and [calibrations](http://www.cvlibs.net/download.php?file=data_object_calib.zip) from [KITTI website](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d) and unzip.
+
+- Download the train-val split of [3DOP](https://xiaozhichen.github.io/files/mv3d/imagesets.tar.gz) and [SubCNN](https://github.com/tanshen/SubCNN/tree/master/fast-rcnn/data/KITTI) and place the data as below
+
+  ~~~
+  ${CenterNet_ROOT}
+  |-- data
+  `-- |-- kitti
+      `-- |-- training
+          |   |-- image_2
+          |   |-- label_2
+          |   |-- calib
+          |-- ImageSets_3dop
+          |   |-- test.txt
+          |   |-- train.txt
+          |   |-- val.txt
+          |   |-- trainval.txt
+          `-- ImageSets_subcnn
+              |-- test.txt
+              |-- train.txt
+              |-- val.txt
+              |-- trainval.txt
+  ~~~
+
+- Run `python convert_kitti_to_coco.py` in `tools` to convert the annotation into COCO format. You can set `DEBUG=True` in `line 5` to visualize the annotation.
+
+- Link image folder
+
+  ~~~
+  cd ${CenterNet_ROOT}/data/kitti/
+  mkdir images
+  ln -s training/image_2 images/trainval
+  ~~~
+
+- The data structure should look like:
+
+  ~~~
+  ${CenterNet_ROOT}
+  |-- data
+  `-- |-- kitti
+      `-- |-- annotations
+          |   |-- kitti_3dop_train.json
+          |   |-- kitti_3dop_val.json
+          |   |-- kitti_subcnn_train.json
+          |   |-- kitti_subcnn_val.json
+          `-- images
+              |-- trainval
+              |-- test
+  ~~~
diff --git a/readme/DEVELOP.md b/readme/DEVELOP.md
new file mode 100644
index 0000000..183a294
--- /dev/null
+++ b/readme/DEVELOP.md
@@ -0,0 +1,19 @@
+# Develop
+
+This document provides tutorials to develop CenterNet. `src/lib/opts.py` lists a few more options that the current version supports.
+
+## New dataset
+Basically there are three steps:
+
+- Convert the dataset annotation to [COCO format](http://cocodataset.org/#format-data). Please refer to [src/tools/convert_kitti_to_coco.py](../src/tools/convert_kitti_to_coco.py) for an example to convert kitti format to coco format.
+- Create a dataset initialization file in `src/lib/customdatasets/dataset`. In most cases you can just copy `src/lib/customdatasets/dataset/coco.py` to your dataset name and change the category information and annotation path.
+- Import your dataset at `src/lib/datasets/dataset_factory`.
+
+## New task
+
+You will need to add files to `src/lib/datasets/sample/`, `src/lib/trains/`, and `src/lib/detectors/`, which specify the data generation during training, the training targets, and the testing, respectively.
+
+## New architecture
+
+- Add your model file to `src/lib/models/networks/`. The model should accept a dict `heads` of `{name: channels}`, which specifies the name of each network output and its number of channels. Make sure your model returns a list (one element per stage; a single-stage model should return a list containing a single element). Each element of the list is a dict containing the same keys as `heads`.
+- Add your model in `model_factory` of `src/lib/models/model.py`.
diff --git a/readme/GETTING_STARTED.md b/readme/GETTING_STARTED.md
new file mode 100644
index 0000000..b059855
--- /dev/null
+++ b/readme/GETTING_STARTED.md
@@ -0,0 +1,89 @@
+# Getting Started
+
+This document provides tutorials to train and evaluate CenterNet. Before getting started, make sure you have finished [installation](INSTALL.md) and [dataset setup](DATA.md).
+
+## Benchmark evaluation
+
+First, download the models you want to evaluate from our [model zoo](MODEL_ZOO.md) and put them in `CenterNet_ROOT/models/`. 
+
+### COCO
+
+To evaluate COCO object detection with DLA
+run
+
+~~~
+python test.py ctdet --exp_id coco_dla --keep_res --load_model ../models/ctdet_coco_dla_2x.pth
+~~~
+
+This will give an AP of `37.4` if set up correctly. `--keep_res` keeps the original image resolution. Without `--keep_res` it will resize the images to `512 x 512`. You can add `--flip_test` and `--flip_test --test_scales 0.5,0.75,1,1.25,1.5` to the above command, for flip test and multi-scale test, respectively. The expected APs are `39.2` and `41.7`, respectively.
+
+To test with hourglass net, run
+
+~~~
+python test.py ctdet --exp_id coco_hg --arch hourglass --fix_res --load_model ../models/ctdet_coco_hg.pth
+~~~
+
+Similarly, to evaluate human pose estimation, run the following command for dla
+
+~~~
+python test.py multi_pose --exp_id dla --keep_res --load_model ../models/multi_pose_dla_3x.pth --flip_test
+~~~
+
+and the following for hourglass
+
+~~~
+python test.py multi_pose --exp_id hg --arch hourglass --keep_res --load_model ../models/multi_pose_hg_3x.pth --flip_test
+~~~
+
+The expected results can be found in the model zoo.
+
+### Pascal
+
+To evaluate object detection on Pascal VOC (test2007), run
+
+~~~
+python test.py ctdet --exp_id dla --dataset pascal --load_model ../models/ctdet_pascal_dla.pth --flip_test
+~~~
+
+Note that we fix the resolution during testing.
+And you can change to other network architectures and resolutions by specifying `--arch` and `--input_res 512`.
+
+### KITTI
+
+To evaluate the kitti dataset, first compile the evaluation tool (from [here](https://github.com/prclibo/kitti_eval)):
+
+~~~
+cd CenterNet_ROOT/src/tools/kitti_eval
+g++ -o evaluate_object_3d_offline evaluate_object_3d_offline.cpp -O3
+~~~
+
+Then run the evaluation with pretrained model:
+
+~~~
+python test.py ddd --exp_id 3dop --dataset kitti --kitti_split 3dop --load_model ../models/ddd_3dop.pth
+~~~
+
+to evaluate the 3DOP split. For the subcnn split, change `--kitti_split` to `subcnn` and load the corresponding models.
+Note that test time augmentation is not trivially applicable for 3D orientation.
+
+## Training
+
+We have packed all the training scripts in the [experiments](../experiments) folder.
+The experiment names correspond to the model names in the [model zoo](MODEL_ZOO.md).
+The number of GPUs for each experiment can be found in the scripts and the model zoo.
+In the case that you don't have 8 GPUs, you can follow the [linear learning rate rule](https://arxiv.org/abs/1706.02677) to scale the learning rate with the batch size.
+For example, to train COCO object detection with dla on 2 GPUs, run
+
+~~~
+python main.py ctdet --exp_id coco_dla --batch_size 32 --master_batch 15 --lr 1.25e-4  --gpus 0,1
+~~~
+
+The default learning rate is `1.25e-4` for batch size `32` (on 2 GPUs).
+By default, pytorch evenly splits the total batch size across the GPUs.
+`--master_batch` allows using a different batch size for the master GPU, which usually costs more memory than other GPUs.
+If you run out of GPU memory, using a slightly smaller batch size (e.g., `112` instead of `128`) with the same learning rate is fine.
+
+If the training is terminated before finishing, you can use the same command with `--resume` to resume training. It will find the latest model with the same `exp_id`.
+
+Our HourglassNet model is finetuned from the pretrained [ExtremeNet model](https://drive.google.com/file/d/1omiOUjWCrFbTJREypuZaODu0bOlF_7Fg/view?usp=sharing) (from the [ExtremeNet repo](https://github.com/xingyizhou/ExtremeNet)).
+You will need to download the model, run `python convert_hourglass_weight.py` to convert the model format, and load the model for training (see the [script](../experiments/ctdet_coco_hg.sh)).
diff --git a/readme/INSTALL.md b/readme/INSTALL.md
new file mode 100644
index 0000000..8798cde
--- /dev/null
+++ b/readme/INSTALL.md
@@ -0,0 +1,74 @@
+# Installation
+
+
+The code was tested on Ubuntu 16.04, with [Anaconda](https://www.anaconda.com/download) Python 3.6 and [PyTorch](http://pytorch.org/) v0.4.1. NVIDIA GPUs are needed for both training and testing.
+After installing Anaconda:
+
+0. [Optional but recommended] create a new conda environment. 
+
+    ~~~
+    conda create --name CenterNet python=3.6
+    ~~~
+    And activate the environment.
+    
+    ~~~
+    conda activate CenterNet
+    ~~~
+
+1. Install pytorch0.4.1:
+
+    ~~~
+    conda install pytorch=0.4.1 torchvision -c pytorch
+    ~~~
+    
+    And disable cudnn batch normalization (due to [this issue](https://github.com/xingyizhou/pytorch-pose-hg-3d/issues/16)).
+    
+     ~~~
+    # PYTORCH=/path/to/pytorch # usually ~/anaconda3/envs/CenterNet/lib/python3.6/site-packages/
+    # for pytorch v0.4.0
+    sed -i "1194s/torch\.backends\.cudnn\.enabled/False/g" ${PYTORCH}/torch/nn/functional.py
+    # for pytorch v0.4.1
+    sed -i "1254s/torch\.backends\.cudnn\.enabled/False/g" ${PYTORCH}/torch/nn/functional.py
+     ~~~
+     
+     For other pytorch versions, you can manually open `torch/nn/functional.py`, find the line with `torch.batch_norm`, and replace `torch.backends.cudnn.enabled` with `False`. We observed slightly worse training results without doing so. 
+     
+2. Install [COCOAPI](https://github.com/cocodataset/cocoapi):
+
+    ~~~
+    # COCOAPI=/path/to/clone/cocoapi
+    git clone https://github.com/cocodataset/cocoapi.git $COCOAPI
+    cd $COCOAPI/PythonAPI
+    make
+    python setup.py install --user
+    ~~~
+
+3. Clone this repo:
+
+    ~~~
+    CenterNet_ROOT=/path/to/clone/CenterNet
+    git clone https://github.com/xingyizhou/CenterNet $CenterNet_ROOT
+    ~~~
+
+
+4. Install the requirements
+
+    ~~~
+    pip install -r requirements.txt
+    ~~~
+    
+    
+5. Compile deformable convolution (from [DCNv2](https://github.com/CharlesShang/DCNv2/tree/pytorch_0.4)).
+
+    ~~~
+    cd $CenterNet_ROOT/src/lib/models/networks/DCNv2
+    ./make.sh
+    ~~~
+6. [Optional, only required if you are using extremenet or multi-scale testing] Compile NMS if you want to use multi-scale testing or test ExtremeNet.
+
+    ~~~
+    cd $CenterNet_ROOT/src/lib/external
+    make
+    ~~~
+
+7. Download pretrained models for [detection]() or [pose estimation]() and move them to `$CenterNet_ROOT/models/`. More models can be found in [Model zoo](MODEL_ZOO.md).
diff --git a/readme/MODEL_ZOO.md b/readme/MODEL_ZOO.md
new file mode 100644
index 0000000..b290d0f
--- /dev/null
+++ b/readme/MODEL_ZOO.md
@@ -0,0 +1,89 @@
+# MODEL ZOO
+
+### Common settings and notes
+
+- The experiments are run with pytorch 0.4.1, CUDA 9.0, and CUDNN 7.1.
+- Training times are measured on our servers with 8 TITAN V GPUs (12 GB Memory).
+- Testing times are measured on our local machine with TITAN Xp GPU. 
+- The models can be downloaded directly from [Google drive](https://drive.google.com/open?id=1px-Xg7jXSC79QqgsD1AAGJQkuf5m0zh_).
+
+## Object Detection
+
+
+### COCO
+
+| Model                    | GPUs |Train time(h)| Test time (ms) |   AP               |  Download | 
+|--------------------------|------|-------------|----------------|--------------------|-----------|
+|[ctdet\_coco\_hg](../experiments/ctdet_coco_hg.sh)       |   5  |109          | 71 / 129 / 674 | 40.3 / 42.2 / 45.1 | [model](https://drive.google.com/open?id=1cNyDmyorOduMRsgXoUnuyUiF6tZNFxaG) |
+|[ctdet\_coco\_dla\_1x](../experiments/ctdet_coco_dla_1x.sh)  |   8  | 57          |  19 / 36 / 248 | 36.3 / 38.2 / 40.7 | [model](https://drive.google.com/open?id=1r89_KNXyDyvUp8NggduG9uKQTMU2DsK_) |
+|[ctdet\_coco\_dla\_2x](../experiments/ctdet_coco_dla_2x.sh)  |   8  | 92          |  19 / 36 / 248 | 37.4 / 39.2 / 41.7 | [model](https://drive.google.com/open?id=1pl_-ael8wERdUREEnaIfqOV_VF2bEVRT) |
+|[ctdet\_coco\_resdcn101](../experiments/ctdet_coco_resdcn101.sh)|   8  | 65          |  22 / 40 / 259 | 34.6 / 36.2 / 39.3 | [model](https://drive.google.com/open?id=1bTJCbAc1szA9lWU-fvVw52lqR3U2TTry) |
+|[ctdet\_coco\_resdcn18](../experiments/ctdet_coco_resdcn18.sh) |   4  | 28          |  7 / 14 / 81   | 28.1 / 30.0 / 33.2 | [model](https://drive.google.com/open?id=1b-_sjq1Pe_dVxt5SeFmoadMfiPTPZqpz) |
+|[exdet\_coco\_hg](../experiments/exdet_coco_hg.sh)       |   5  |215          | 134 / 246/1340 | 35.8 / 39.8 / 42.4 | [model](https://drive.google.com/open?id=1-5bT5ZF8bXriJ-wAvOjJFrBLvZV2-mlV) |
+|[exdet\_coco\_dla](../experiments/exdet_coco_dla.sh)      |   8  |133          | 51 / 90 / 481  | 33.0 / 36.5 / 38.5 | [model](https://drive.google.com/open?id=1PFcEqN0KjFuq9XaqzB7TkVD3pvXZx04e) |
+
+#### Notes
+
+- All models are trained on COCO train 2017 and evaluated on val 2017. 
+- We show test time and AP with no augmentation / flip augmentation / multi scale (0.5, 0.75, 1, 1.25, 1.5) augmentation. 
+- Results on COCO test-dev can be found in the paper or add `--trainval` for `test.py`. 
+- exdet is our re-implementation of [ExtremeNet](https://github.com/xingyizhou/ExtremeNet). The testing does not include edge aggregation.
+- For dla and resnets, `1x` means the training schedule that train 140 epochs with learning rate dropped 10 times at the 90 and 120 epoch (following [SimpleBaseline](https://github.com/Microsoft/human-pose-estimation.pytorch)). `2x` means train 230 epochs with learning rate dropped 10 times at the 180 and 210 epoch. The training schedules are **not** carefully investigated.
+- The hourglass trained schedule follows [ExtremeNet](https://github.com/xingyizhou/ExtremeNet): trains 50 epochs (approximately 250000 iterations in batch size 24) and drops learning rate at the 40 epoch.
+- Testing time includes network forwarding time, decoding time, and nms time (for ExtremeNet).
+- We observed up to 0.4 AP performance jitter due to randomness in training. 
+
+### Pascal VOC
+
+| Model                           |GPUs| Train time (h)| Test time (ms) | mAP  | Download  |
+|---------------------------------|----|---------------|----------------|------|-----------|
+|[ctdet\_pascal\_dla\_384](../experiments/ctdet_pascal_dla_384.sh)      | 1  |15             | 20             | 79.3 | [model](https://drive.google.com/open?id=1IC3FZkxAQHm2rxoIGmS4YluYpZxwYkJf) |
+|[ctdet\_pascal\_dla\_512](../experiments/ctdet_pascal_dla_512.sh)      | 2  |15             | 30             | 80.7 | [model](https://drive.google.com/open?id=1jIfK9EyqzNcupxGsp3YRnEiewrIG4_Ma) |
+|[ctdet\_pascal\_resdcn18\_384](../experiments/ctdet_pascal_resdcn18_384.sh) | 1  |3              | 7              | 72.6 | [model](https://drive.google.com/open?id=1Kq27D1uoPZK42j2alDWmCGyqRU2ob1BX) |
+|[ctdet\_pascal\_resdcn18\_512](../experiments/ctdet_pascal_resdcn18_512.sh) | 1  |5              | 10             | 75.7 | [model](https://drive.google.com/open?id=1MRUJTTJ4-ZDN0Y-zQOqQBqjrQMcXFzet) |
+|[ctdet\_pascal\_resdcn101\_384](../experiments/ctdet_pascal_resdcn101_384.sh)| 2  |7              | 22             | 77.1 | [model](https://drive.google.com/open?id=11YXE04zILuXA5-kaYQEEg0ljNKBe6GPO) |
+|[ctdet\_pascal\_resdcn101\_512](../experiments/ctdet_pascal_resdcn101_512.sh)| 4  |7              | 33             | 78.7 | [model](https://drive.google.com/open?id=1xhEf-a_y2Di6YdyPpCIj0-kVFjQvDf9N) |
+
+#### Notes
+- All models are trained on trainval 07+12 and tested on test 2007.
+- Flip test is used by default.
+- Training schedule: train for 70 epochs with learning rate dropped 10 times at the 45 and 60 epoch.
+- We observed up to 1 mAP performance jitter due to randomness in training.
+
+## Human pose estimation
+
+### COCO
+
+| Model                    | GPUs |Train time(h)| Test time (ms) |   AP        |  Download | 
+|--------------------------|------|-------------|----------------|-------------|-----------|
+|[multi\_pose\_hg_1x](../experiments/multi_pose_hg_1x.sh)    |   5  |62           | 151            | 58.7        | [model](https://drive.google.com/open?id=1HBB5KRaSj-m-vtpGESm7_3evNP5Y84RS) |
+|[multi\_pose\_hg_3x](../experiments/multi_pose_hg_3x.sh)    |   5  |188          | 151            | 64.0        | [model](https://drive.google.com/open?id=1n6EvwhTbz7LglVXXlL9irJia7YuakHdB) |
+|[multi\_pose\_dla_1x](../experiments/multi_pose_dla_1x.sh)   |   8  |30           | 44             | 54.7        | [model](https://drive.google.com/open?id=1VeiRtuXfCbmhQNGV-XWL6elUzpuWN-4K) |
+|[multi\_pose\_dla_3x](../experiments/multi_pose_dla_3x.sh)   |   8  |70           | 44             | 58.9        | [model](https://drive.google.com/open?id=1PO1Ax_GDtjiemEmDVD7oPWwqQkUu28PI) |
+
+#### Notes
+- All models are trained on keypoint train 2017 images which contain at least one human with keypoint annotations (64115 images).
+- The evaluation is done on COCO keypoint val 2017 (5000 images).
+- Flip test is used by default.
+- The models are fine-tuned from the corresponding center point detection models.
+- Dla training schedule: `1x`: train for 140 epochs with learning rate dropped 10 times at the 90 and 120 epoch.`3x`: train for 320 epochs with learning rate dropped 10 times at the 270 and 300 epoch.
+- Hourglass training schedule: `1x`: train for 50 epochs with learning rate dropped 10 times at the 40 epoch.`3x`: train for 150 epochs with learning rate dropped 10 times at the 130 epoch.
+
+## 3D bounding box detection
+
+#### Notes
+- The 3dop split is from [3DOP](https://papers.nips.cc/paper/5644-3d-object-proposals-for-accurate-object-class-detection) and the subcnn split is from [SubCNN](https://github.com/tanshen/SubCNN).
+- No augmentation is used in testing.
+- The models are trained for 70 epochs with learning rate dropped at the 45 and 60 epoch.
+
+### KITTI 3DOP split
+
+|Model       |GPUs|Train time|Test time|AP-E|AP-M|AP-H|AOS-E|AOS-M|AOS-H|BEV-E|BEV-M|BEV-H| Download |
+|------------|----|----------|---------|----|----|----|-----|-----|-----|-----|-----|-----|----------|
+|[ddd_3dop](../experiments/ddd_3dop.sh)|2   | 7h       |  31ms   |96.9|87.8|79.2|93.9 |84.3 |75.7 |34.0 |30.5 |26.8 | [model](https://drive.google.com/open?id=1znsM6E-aVTkATreDuUVxoU0ajL1az8rz)|
+
+### KITTI SubCNN split
+
+|Model       |GPUs|Train time|Test time|AP-E|AP-M|AP-H|AOS-E|AOS-M|AOS-H|BEV-E|BEV-M|BEV-H| Download |
+|------------|----|----------|---------|----|----|----|-----|-----|-----|-----|-----|-----|----------|
+|[ddd_sub](../experiments/ddd_sub.sh) |2   | 7h       |  31ms   |89.6|79.8|70.3|85.7 |75.2 |65.9 |34.9 |27.7 |26.4 | [model](https://drive.google.com/open?id=15XuJxTxCBnA8O37M_ghjppnWmVnjC0Hp)|
\ No newline at end of file
diff --git a/readme/det1.png b/readme/det1.png
new file mode 100644
index 0000000..f85faab
Binary files /dev/null and b/readme/det1.png differ
diff --git a/readme/det2.png b/readme/det2.png
new file mode 100644
index 0000000..d88445e
Binary files /dev/null and b/readme/det2.png differ
diff --git a/readme/fig2.png b/readme/fig2.png
new file mode 100644
index 0000000..6e0eb64
Binary files /dev/null and b/readme/fig2.png differ
diff --git a/readme/pose1.png b/readme/pose1.png
new file mode 100644
index 0000000..f9f832b
Binary files /dev/null and b/readme/pose1.png differ
diff --git a/readme/pose2.png b/readme/pose2.png
new file mode 100644
index 0000000..386def3
Binary files /dev/null and b/readme/pose2.png differ
diff --git a/readme/pose3.png b/readme/pose3.png
new file mode 100644
index 0000000..97280ef
Binary files /dev/null and b/readme/pose3.png differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d18aef0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+opencv-python
+Cython
+numba
+progress
+matplotlib
+easydict
+scipy
diff --git a/src/_init_paths.py b/src/_init_paths.py
new file mode 100644
index 0000000..db11e08
--- /dev/null
+++ b/src/_init_paths.py
@@ -0,0 +1,18 @@
+"""Add the sibling ``lib`` directory to ``sys.path`` when imported."""
+import os.path as osp
+import sys
+
+
+def add_path(path):
+    """Insert *path* at the front of ``sys.path`` unless already present."""
+    if path not in sys.path:
+        sys.path.insert(0, path)
+
+
+# Resolve to an absolute directory so the entry stays valid even if the
+# process later changes its working directory.
+this_dir = osp.dirname(osp.abspath(__file__))
+
+# Add lib to PYTHONPATH
+lib_path = osp.join(this_dir, 'lib')
+add_path(lib_path)
diff --git a/src/demo.py b/src/demo.py
new file mode 100644
index 0000000..f2b5e77
--- /dev/null
+++ b/src/demo.py
@@ -0,0 +1,75 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import _init_paths
+
+import os
+import cv2
+
+from opts import opts
+from detectors.detector_factory import detector_factory
+
+image_ext = ['jpg', 'jpeg', 'png', 'webp']
+video_ext = ['mp4', 'mov', 'avi', 'mkv']
+time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
+
+
+def _file_ext(path):
+    """Return the lower-cased text after the last '.' in *path* ('' if none)."""
+    _, dot, ext = path.rpartition('.')
+    return ext.lower() if dot else ''
+
+
+def _print_time_stats(ret):
+    """Print each ``time_stats`` entry of the detector result *ret* on one line."""
+    print(''.join('{} {:.3f}s |'.format(stat, ret[stat])
+                  for stat in time_stats))
+
+
+def demo(opt):
+    """Run the ``opt.task`` detector on the input selected by ``opt.demo``.
+
+    ``opt.demo`` may be ``'webcam'``, a path with a video extension, or a
+    directory whose image files are processed in sorted order; any other
+    value is handed to the detector as a single input.
+    """
+    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
+    opt.debug = max(opt.debug, 1)
+    Detector = detector_factory[opt.task]
+    detector = Detector(opt)
+
+    if opt.demo == 'webcam' or _file_ext(opt.demo) in video_ext:
+        cam = cv2.VideoCapture(0 if opt.demo == 'webcam' else opt.demo)
+        detector.pause = False
+        try:
+            while True:
+                ok, img = cam.read()
+                if not ok:
+                    # No frame was returned (end of video or failed
+                    # grab); ``img`` would be unusable for imshow.
+                    break
+                cv2.imshow('input', img)
+                ret = detector.run(img)
+                _print_time_stats(ret)
+                if cv2.waitKey(1) == 27:
+                    return  # esc to quit
+        finally:
+            # Always free the capture device/file, also on esc or errors.
+            cam.release()
+    else:
+        if os.path.isdir(opt.demo):
+            image_names = [os.path.join(opt.demo, file_name)
+                           for file_name in sorted(os.listdir(opt.demo))
+                           if _file_ext(file_name) in image_ext]
+        else:
+            image_names = [opt.demo]
+
+        for image_name in image_names:
+            ret = detector.run(image_name)
+            _print_time_stats(ret)
+
+
+if __name__ == '__main__':
+    opt = opts().init()
+    demo(opt)
diff --git a/src/lib/customdatasets/dataset/coco.py b/src/lib/customdatasets/dataset/coco.py
new file mode 100644
index 0000000..d0efc53
--- /dev/null
+++ b/src/lib/customdatasets/dataset/coco.py
@@ -0,0 +1,145 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pycocotools.coco as coco
+from pycocotools.cocoeval import COCOeval
+import numpy as np
+import json
+import os
+
+import torch.utils.data as data
+
+class COCO(data.Dataset):
+  """COCO 2017 dataset metadata with result export and bbox evaluation.
+
+  The constructor loads the annotation file through ``pycocotools`` and
+  records the image ids of the chosen split.
+  """
+  num_classes = 80
+  default_resolution = [512, 512]
+  mean = np.array([0.40789654, 0.44719302, 0.47026115],
+                   dtype=np.float32).reshape(1, 1, 3)
+  std  = np.array([0.28863828, 0.27408164, 0.27809835],
+                   dtype=np.float32).reshape(1, 1, 3)
+
+  def __init__(self, opt, split):
+    """Resolve the paths for *split* and load its annotations.
+
+    Reads ``opt.data_dir`` and ``opt.task``.  The ``'test'`` split uses the
+    test-dev image info file; otherwise ``opt.task == 'exdet'`` selects the
+    ``instances_extreme_*`` annotations instead of ``instances_*``.
+    """
+    super(COCO, self).__init__()
+    self.data_dir = os.path.join(opt.data_dir, 'coco')
+    self.img_dir = os.path.join(self.data_dir, '{}2017'.format(split))
+    # Format only the file name, so braces inside ``opt.data_dir`` are never
+    # treated as format fields.
+    if split == 'test':
+      annot_name = 'image_info_test-dev2017.json'
+    elif opt.task == 'exdet':
+      annot_name = 'instances_extreme_{}2017.json'.format(split)
+    else:
+      annot_name = 'instances_{}2017.json'.format(split)
+    self.annot_path = os.path.join(self.data_dir, 'annotations', annot_name)
+    self.max_objs = 128
+    self.class_name = [
+      '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
+      'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+      'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
+      'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
+      'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
+      'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
+      'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
+      'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
+      'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
+      'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+      'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+      'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+      'scissors', 'teddy bear', 'hair drier', 'toothbrush']
+    self._valid_ids = [
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13,
+      14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+      24, 25, 27, 28, 31, 32, 33, 34, 35, 36,
+      37, 38, 39, 40, 41, 42, 43, 44, 46, 47,
+      48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+      58, 59, 60, 61, 62, 63, 64, 65, 67, 70,
+      72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
+      82, 84, 85, 86, 87, 88, 89, 90]
+    # Maps a COCO category id to its index in ``_valid_ids``.
+    self.cat_ids = {v: i for i, v in enumerate(self._valid_ids)}
+    self.voc_color = [(v // 32 * 64 + 64, (v // 8) % 4 * 64, v % 8 * 32) \
+                      for v in range(1, self.num_classes + 1)]
+    self._data_rng = np.random.RandomState(123)
+    self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
+                             dtype=np.float32)
+    self._eig_vec = np.array([
+        [-0.58752847, -0.69563484, 0.41340352],
+        [-0.5832747, 0.00994535, -0.81221408],
+        [-0.56089297, 0.71832671, 0.41158938]
+    ], dtype=np.float32)
+    # self.mean = np.array([0.485, 0.456, 0.406], np.float32).reshape(1, 1, 3)
+    # self.std = np.array([0.229, 0.224, 0.225], np.float32).reshape(1, 1, 3)
+
+    self.split = split
+    self.opt = opt
+
+    print('==> initializing coco 2017 {} data.'.format(split))
+    self.coco = coco.COCO(self.annot_path)
+    self.images = self.coco.getImgIds()
+    self.num_samples = len(self.images)
+
+    print('Loaded {} {} samples'.format(split, self.num_samples))
+
+  def _to_float(self, x):
+    """Round *x* to two decimals via string formatting and return a float."""
+    return float("{:.2f}".format(x))
+
+  def convert_eval_format(self, all_bboxes):
+    """Flatten per-image, per-class detections into COCO result dicts.
+
+    ``all_bboxes[image_id][cls_ind]`` is iterated as boxes laid out as
+    ``[x1, y1, x2, y2, score, ...]`` with a 1-based ``cls_ind``.  Each box is
+    exported as ``[x1, y1, x2 - x1, y2 - y1]``; entries 5..12, when present,
+    become ``extreme_points``.  The input is not modified, so the same
+    results can safely be converted more than once.
+    """
+    detections = []
+    for image_id in all_bboxes:
+      for cls_ind in all_bboxes[image_id]:
+        category_id = self._valid_ids[cls_ind - 1]
+        for bbox in all_bboxes[image_id][cls_ind]:
+          # Derive width/height into a new list rather than subtracting in
+          # place, which would corrupt the caller's boxes on a second call.
+          xywh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]]
+          detection = {
+              "image_id": int(image_id),
+              "category_id": int(category_id),
+              "bbox": list(map(self._to_float, xywh)),
+              "score": self._to_float(bbox[4])
+          }
+          if len(bbox) > 5:
+            detection["extreme_points"] = list(
+                map(self._to_float, bbox[5:13]))
+          detections.append(detection)
+    return detections
+
+  def __len__(self):
+    """Return the number of images in the loaded split."""
+    return self.num_samples
+
+  def save_results(self, results, save_dir):
+    """Convert *results* and write them to ``results.json`` in *save_dir*."""
+    detections = self.convert_eval_format(results)
+    # The context manager closes (and flushes) the file before returning.
+    with open('{}/results.json'.format(save_dir), 'w') as f:
+      json.dump(detections, f)
+
+  def run_eval(self, results, save_dir):
+    """Save *results*, reload them with ``loadRes`` and print bbox metrics."""
+    self.save_results(results, save_dir)
+    coco_dets = self.coco.loadRes('{}/results.json'.format(save_dir))
+    coco_eval = COCOeval(self.coco, coco_dets, "bbox")
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
diff --git a/src/lib/customdatasets/dataset/coco_hp.py b/src/lib/customdatasets/dataset/coco_hp.py
new file mode 100644
index 0000000..dd8fd3a
--- /dev/null
+++ b/src/lib/customdatasets/dataset/coco_hp.py
@@ -0,0 +1,120 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pycocotools.coco as coco
+from pycocotools.cocoeval import COCOeval
+import numpy as np
+import json
+import os
+
+import torch.utils.data as data
+
+class COCOHP(data.Dataset):
+  num_classes = 1
+  num_joints = 17
+  default_resolution = [512, 512]
+  mean = np.array([0.40789654, 0.44719302, 0.47026115],
+                   dtype=np.float32).reshape(1, 1, 3)
+  std  = np.array([0.28863828, 0.27408164, 0.27809835],
+                   dtype=np.float32).reshape(1, 1, 3)
+  flip_idx = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], 
+              [11, 12], [13, 14], [15, 16]]
+  def __init__(self, opt, split):
+    super(COCOHP, self).__init__()
+    self.edges = [[0, 1], [0, 2], [1, 3], [2, 4], 
+                  [4, 6], [3, 5], [5, 6], 
+                  [5, 7], [7, 9], [6, 8], [8, 10], 
+                  [6, 12], [5, 11], [11, 12], 
+                  [12, 14], [14, 16], [11, 13], [13, 15]]
+    
+    self.acc_idxs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
+    self.data_dir = os.path.join(opt.data_dir, 'coco')
+    self.img_dir = os.path.join(self.data_dir, '{}2017'.format(split))
+    if split == 'test':
+      self.annot_path = os.path.join(
+          self.data_dir, 'annotations', 
+          'image_info_test-dev2017.json').format(split)
+    else:
+      self.annot_path = os.path.join(
+        self.data_dir, 'annotations', 
+        'person_keypoints_{}2017.json').format(split)
+    self.max_objs = 32
+    self._data_rng = np.random.RandomState(123)
+    self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
+                             dtype=np.float32)
+    self._eig_vec = np.array([
+        [-0.58752847, -0.69563484, 0.41340352],
+        [-0.5832747, 0.00994535, -0.81221408],
+        [-0.56089297, 0.71832671, 0.41158938]
+    ], dtype=np.float32)
+    self.split = split
+    self.opt = opt
+
+    print('==> initializing coco 2017 {} data.'.format(split))
+    self.coco = coco.COCO(self.annot_path)
+    image_ids = self.coco.getImgIds()
+
+    if split == 'train':
+      self.images = []
+      for img_id in image_ids:
+        idxs = self.coco.getAnnIds(imgIds=[img_id])
+        if len(idxs) > 0:
+          self.images.append(img_id)
+    else:
+      self.images = image_ids
+    self.num_samples = len(self.images)
+    print('Loaded {} {} samples'.format(split, self.num_samples))
+
+  def _to_float(self, x):
+    return float("{:.2f}".format(x))
+
+  def convert_eval_format(self, all_bboxes):
+    # import pdb; pdb.set_trace()
+    detections = []
+    for image_id in all_bboxes:
+      for cls_ind in all_bboxes[image_id]:
+        category_id = 1
+        for dets in all_bboxes[image_id][cls_ind]:
+          bbox = dets[:4]
+          bbox[2] -= bbox[0]
+          bbox[3] -= bbox[1]
+          score = dets[4]
+          bbox_out  = list(map(self._to_float, bbox))
+          keypoints = np.concatenate([
+            np.array(dets[5:39], dtype=np.float32).reshape(-1, 2), 
+            np.ones((17, 1), dtype=np.float32)], axis=1).reshape(51).tolist()
+          keypoints  = list(map(self._to_float, keypoints))
+
+          detection = {
+              "image_id": int(image_id),
+              "category_id": int(category_id),
+              "bbox": bbox_out,
+              "score": float("{:.2f}".format(score)),
+              "keypoints": keypoints
+          }
+          detections.append(detection)
+    return detections
+
+  def __len__(self):
+    return self.num_samples
+
+  def save_results(self, results, save_dir):
+    json.dump(self.convert_eval_format(results), 
+              open('{}/results.json'.format(save_dir), 'w'))
+
+
+  def run_eval(self, results, save_dir):
+    # result_json = os.path.join(opt.save_dir, "results.json")
+    # detections  = convert_eval_format(all_boxes)
+    # json.dump(detections, open(result_json, "w"))
+    self.save_results(results, save_dir)
+    coco_dets = self.coco.loadRes('{}/results.json'.format(save_dir))
+    coco_eval = COCOeval(self.coco, coco_dets, "keypoints")
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    coco_eval = COCOeval(self.coco, coco_dets, "bbox")
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
\ No newline at end of file
diff --git a/src/lib/customdatasets/dataset/kitti.py b/src/lib/customdatasets/dataset/kitti.py
new file mode 100644
index 0000000..b33664d
--- /dev/null
+++ b/src/lib/customdatasets/dataset/kitti.py
@@ -0,0 +1,89 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.utils.data as data
+import pycocotools.coco as coco
+import numpy as np
+import torch
+import json
+import cv2
+import os
+import math
+
+import torch.utils.data as data
+
+
+class KITTI(data.Dataset):
+  num_classes = 3
+  default_resolution = [384, 1280]
+  mean = np.array([0.485, 0.456, 0.406], np.float32).reshape(1, 1, 3)
+  std = np.array([0.229, 0.224, 0.225], np.float32).reshape(1, 1, 3)
+
+  def __init__(self, opt, split):
+    super(KITTI, self).__init__()
+    self.data_dir = os.path.join(opt.data_dir, 'kitti')
+    self.img_dir = os.path.join(self.data_dir, 'images', 'trainval')
+    if opt.trainval:
+      split = 'trainval' if split == 'train' else 'test'
+      self.img_dir = os.path.join(self.data_dir, 'images', split)
+      self.annot_path = os.path.join(
+        self.data_dir, 'annotations', 'kitti_{}.json').format(split)
+    else:
+      self.annot_path = os.path.join(self.data_dir, 
+        'annotations', 'kitti_{}_{}.json').format(opt.kitti_split, split)
+    self.max_objs = 50
+    self.class_name = [
+      '__background__', 'Pedestrian', 'Car', 'Cyclist']
+    self.cat_ids = {1:0, 2:1, 3:2, 4:-3, 5:-3, 6:-2, 7:-99, 8:-99, 9:-1}
+    
+    self._data_rng = np.random.RandomState(123)
+    self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
+                             dtype=np.float32)
+    self._eig_vec = np.array([
+        [-0.58752847, -0.69563484, 0.41340352],
+        [-0.5832747, 0.00994535, -0.81221408],
+        [-0.56089297, 0.71832671, 0.41158938]
+    ], dtype=np.float32)
+    self.split = split
+    self.opt = opt
+    self.alpha_in_degree = False
+
+    print('==> initializing kitti {}, {} data.'.format(opt.kitti_split, split))
+    self.coco = coco.COCO(self.annot_path)
+    self.images = self.coco.getImgIds()
+    self.num_samples = len(self.images)
+
+    print('Loaded {} {} samples'.format(split, self.num_samples))
+
+  def __len__(self):
+    return self.num_samples
+
+  def _to_float(self, x):
+    return float("{:.2f}".format(x))
+
+  def convert_eval_format(self, all_bboxes):
+    pass
+
+  def save_results(self, results, save_dir):
+    results_dir = os.path.join(save_dir, 'results')
+    if not os.path.exists(results_dir):
+      os.mkdir(results_dir)
+    for img_id in results.keys():
+      out_path = os.path.join(results_dir, '{:06d}.txt'.format(img_id))
+      f = open(out_path, 'w')
+      for cls_ind in results[img_id]:
+        for j in range(len(results[img_id][cls_ind])):
+          class_name = self.class_name[cls_ind]
+          f.write('{} 0.0 0'.format(class_name))
+          for i in range(len(results[img_id][cls_ind][j])):
+            f.write(' {:.2f}'.format(results[img_id][cls_ind][j][i]))
+          f.write('\n')
+      f.close()
+
+  def run_eval(self, results, save_dir):
+    self.save_results(results, save_dir)
+    os.system('./tools/kitti_eval/evaluate_object_3d_offline ' + \
+              '../data/kitti/training/label_val ' + \
+              '{}/results/'.format(save_dir))
+    
diff --git a/src/lib/customdatasets/dataset/pascal.py b/src/lib/customdatasets/dataset/pascal.py
new file mode 100644
index 0000000..d6a8ca0
--- /dev/null
+++ b/src/lib/customdatasets/dataset/pascal.py
@@ -0,0 +1,82 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pycocotools.coco as coco
+import numpy as np
+import torch
+import json
+import os
+
+import torch.utils.data as data
+
+class PascalVOC(data.Dataset):
+  num_classes = 20
+  default_resolution = [384, 384]
+  mean = np.array([0.485, 0.456, 0.406],
+                   dtype=np.float32).reshape(1, 1, 3)
+  std  = np.array([0.229, 0.224, 0.225],
+                   dtype=np.float32).reshape(1, 1, 3)
+  
+  def __init__(self, opt, split):
+    super(PascalVOC, self).__init__()
+    self.data_dir = os.path.join(opt.data_dir, 'voc')
+    self.img_dir = os.path.join(self.data_dir, 'images')
+    _ann_name = {'train': 'trainval0712', 'val': 'test2007'}
+    self.annot_path = os.path.join(
+      self.data_dir, 'annotations', 
+      'pascal_{}.json').format(_ann_name[split])
+    self.max_objs = 50
+    self.class_name = ['__background__', "aeroplane", "bicycle", "bird", "boat",
+     "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", 
+     "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", 
+     "train", "tvmonitor"]
+    self._valid_ids = np.arange(1, 21, dtype=np.int32)
+    self.cat_ids = {v: i for i, v in enumerate(self._valid_ids)}
+    self._data_rng = np.random.RandomState(123)
+    self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571],
+                             dtype=np.float32)
+    self._eig_vec = np.array([
+        [-0.58752847, -0.69563484, 0.41340352],
+        [-0.5832747, 0.00994535, -0.81221408],
+        [-0.56089297, 0.71832671, 0.41158938]
+    ], dtype=np.float32)
+    self.split = split
+    self.opt = opt
+
+    print('==> initializing pascal {} data.'.format(_ann_name[split]))
+    self.coco = coco.COCO(self.annot_path)
+    self.images = sorted(self.coco.getImgIds())
+    self.num_samples = len(self.images)
+
+    print('Loaded {} {} samples'.format(split, self.num_samples))
+
+  def _to_float(self, x):
+    return float("{:.2f}".format(x))
+
+  def convert_eval_format(self, all_bboxes):
+    detections = [[[] for __ in range(self.num_samples)] \
+                  for _ in range(self.num_classes + 1)]
+    for i in range(self.num_samples):
+      img_id = self.images[i]
+      for j in range(1, self.num_classes + 1):
+        if isinstance(all_bboxes[img_id][j], np.ndarray):
+          detections[j][i] = all_bboxes[img_id][j].tolist()
+        else:
+          detections[j][i] = all_bboxes[img_id][j]
+    return detections
+
+  def __len__(self):
+    return self.num_samples
+
+  def save_results(self, results, save_dir):
+    json.dump(self.convert_eval_format(results), 
+              open('{}/results.json'.format(save_dir), 'w'))
+
+  def run_eval(self, results, save_dir):
+    # result_json = os.path.join(save_dir, "results.json")
+    # detections  = self.convert_eval_format(results)
+    # json.dump(detections, open(result_json, "w"))
+    self.save_results(results, save_dir)
+    os.system('python tools/reval.py ' + \
+              '{}/results.json'.format(save_dir))
diff --git a/src/lib/customdatasets/dataset_factory.py b/src/lib/customdatasets/dataset_factory.py
new file mode 100644
index 0000000..dd2b4fd
--- /dev/null
+++ b/src/lib/customdatasets/dataset_factory.py
@@ -0,0 +1,38 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from .sample.ddd import DddDataset
+from .sample.exdet import EXDetDataset
+from .sample.ctdet import CTDetDataset
+from .sample.ctseg import CTSegDataset
+from .sample.multi_pose import MultiPoseDataset
+
+from .dataset.coco import COCO
+from .dataset.pascal import PascalVOC
+from .dataset.kitti import KITTI
+from .dataset.coco_hp import COCOHP
+
+
+# Dataset name -> class that loads annotations and exports/evaluates
+# results for that dataset.
+dataset_factory = {
+  'coco': COCO,
+  'pascal': PascalVOC,
+  'kitti': KITTI,
+  'coco_hp': COCOHP
+}
+
+# Task name -> class that builds the training sample for that task.
+_sample_factory = {
+  'exdet': EXDetDataset,
+  'ctdet': CTDetDataset,
+  'ctseg':CTSegDataset,
+  'ddd': DddDataset,
+  'multi_pose': MultiPoseDataset
+  
+}
+
+
+def get_dataset(dataset, task):
+  class Dataset(dataset_factory[dataset], _sample_factory[task]):
+    pass
+  return Dataset
+  
diff --git a/src/lib/customdatasets/sample/ctdet.py b/src/lib/customdatasets/sample/ctdet.py
new file mode 100644
index 0000000..6dc09ae
--- /dev/null
+++ b/src/lib/customdatasets/sample/ctdet.py
@@ -0,0 +1,145 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.utils.data as data
+import numpy as np
+import torch
+import json
+import cv2
+import os
+from utils.image import flip, color_aug
+from utils.image import get_affine_transform, affine_transform
+from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
+from utils.image import draw_dense_reg
+import math
+
+class CTDetDataset(data.Dataset):
+  """Sample builder for the 'ctdet' task.
+
+  Only ``__getitem__`` and two small helpers are defined here. The
+  attributes it reads (``images``, ``coco``, ``img_dir``, ``max_objs``,
+  ``cat_ids``, ``num_classes``, ``mean``, ``std``, ``split``, ``opt``,
+  ``_data_rng``, ``_eig_val``, ``_eig_vec``) are not assigned in this class;
+  they are expected to come from the dataset class it is combined with.
+  """
+
+  def _coco_box_to_bbox(self, box):
+    """Convert a ``[x, y, w, h]`` box to float32 ``[x1, y1, x2, y2]``."""
+    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                    dtype=np.float32)
+    return bbox
+
+  def _get_border(self, border, size):
+    """Halve ``border`` (//1, //2, //4, ...) until ``size - b > b``; return b."""
+    i = 1
+    while size - border // i <= border // i:
+        i *= 2
+    return border // i
+
+  def __getitem__(self, index):
+    """Load image ``index`` and build the network input and its targets.
+
+    Returns:
+      A dict with 'input', 'hm', 'reg_mask', 'ind' and 'wh'. Depending on
+      ``self.opt``, 'wh' is replaced by 'dense_wh'/'dense_wh_mask' or by
+      'cat_spec_wh'/'cat_spec_mask', 'reg' is added, and 'meta' is added
+      when ``opt.debug > 0`` or the split is not 'train'.
+    """
+    img_id = self.images[index]
+    file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
+    img_path = os.path.join(self.img_dir, file_name)
+    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
+    anns = self.coco.loadAnns(ids=ann_ids)
+    # Annotations beyond max_objs are ignored.
+    num_objs = min(len(anns), self.max_objs)
+
+    img = cv2.imread(img_path)
+
+    height, width = img.shape[0], img.shape[1]
+    # c: crop centre as (x, y), initially the image centre.
+    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
+    if self.opt.keep_res:
+      # presumably opt.pad has the form 2**k - 1, so this rounds up to a
+      # multiple of pad + 1 -- TODO confirm
+      input_h = (height | self.opt.pad) + 1
+      input_w = (width | self.opt.pad) + 1
+      s = np.array([input_w, input_h], dtype=np.float32)
+    else:
+      # s: scale handed to get_affine_transform; here the longer image side.
+      s = max(img.shape[0], img.shape[1]) * 1.0
+      input_h, input_w = self.opt.input_h, self.opt.input_w
+    
+    flipped = False
+    if self.split == 'train':
+      if not self.opt.not_rand_crop:
+        # Random scale factor from arange(0.6, 1.4, 0.1) and a random centre
+        # kept at least one border away from each image edge.
+        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
+        w_border = self._get_border(128, img.shape[1])
+        h_border = self._get_border(128, img.shape[0])
+        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
+        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
+      else:
+        # Gaussian jitter of centre and scale, clipped to +-2*cf and +-sf.
+        sf = self.opt.scale
+        cf = self.opt.shift
+        c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+        c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
+      
+      if np.random.random() < self.opt.flip:
+        # Mirror the image left-right and mirror the centre x accordingly.
+        flipped = True
+        img = img[:, ::-1, :]
+        c[0] =  width - c[0] - 1
+        
+
+    trans_input = get_affine_transform(
+      c, s, 0, [input_w, input_h])
+    inp = cv2.warpAffine(img, trans_input, 
+                         (input_w, input_h),
+                         flags=cv2.INTER_LINEAR)
+    inp = (inp.astype(np.float32) / 255.)
+    if self.split == 'train' and not self.opt.no_color_aug:
+      # Return value is unused, so color_aug presumably edits inp in place.
+      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
+    inp = (inp - self.mean) / self.std
+    # Move the channel axis first: (H, W, C) -> (C, H, W).
+    inp = inp.transpose(2, 0, 1)
+
+    output_h = input_h // self.opt.down_ratio
+    output_w = input_w // self.opt.down_ratio
+    num_classes = self.num_classes
+    # Same centre/scale as the input transform, but to the output resolution.
+    trans_output = get_affine_transform(c, s, 0, [output_w, output_h])
+
+    # Targets; per-object arrays have max_objs rows, unused rows stay zero.
+    hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)
+    wh = np.zeros((self.max_objs, 2), dtype=np.float32)
+    dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)
+    reg = np.zeros((self.max_objs, 2), dtype=np.float32)
+    ind = np.zeros((self.max_objs), dtype=np.int64)
+    reg_mask = np.zeros((self.max_objs), dtype=np.uint8)
+    cat_spec_wh = np.zeros((self.max_objs, num_classes * 2), dtype=np.float32)
+    cat_spec_mask = np.zeros((self.max_objs, num_classes * 2), dtype=np.uint8)
+    
+    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
+                    draw_umich_gaussian
+
+    gt_det = []
+    for k in range(num_objs):
+      ann = anns[k]
+      bbox = self._coco_box_to_bbox(ann['bbox'])
+      cls_id = int(self.cat_ids[ann['category_id']])
+      if flipped:
+        # Mirror x1/x2 and swap them so that x1 <= x2 still holds.
+        bbox[[0, 2]] = width - bbox[[2, 0]] - 1
+      # Map both corners to output coordinates, then clip to the map.
+      bbox[:2] = affine_transform(bbox[:2], trans_output)
+      bbox[2:] = affine_transform(bbox[2:], trans_output)
+      bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
+      bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
+      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+      # Boxes that collapse to zero width or height after clipping are skipped.
+      if h > 0 and w > 0:
+        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
+        radius = max(0, int(radius))
+        # With mse_loss the fixed opt.hm_gauss replaces the computed radius.
+        radius = self.opt.hm_gauss if self.opt.mse_loss else radius
+        ct = np.array(
+          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
+        ct_int = ct.astype(np.int32)
+        draw_gaussian(hm[cls_id], ct_int, radius)
+        wh[k] = 1. * w, 1. * h
+        # Row-major flat index of the integer centre in the output map.
+        ind[k] = ct_int[1] * output_w + ct_int[0]
+        # Fractional part of the centre lost by the integer cast.
+        reg[k] = ct - ct_int
+        reg_mask[k] = 1
+        # Class-specific size: only the two columns of this class are set.
+        cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
+        cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
+        if self.opt.dense_wh:
+          draw_dense_reg(dense_wh, hm.max(axis=0), ct_int, wh[k], radius)
+        # Ground-truth row: [x1, y1, x2, y2, 1, cls_id] in output coordinates.
+        gt_det.append([ct[0] - w / 2, ct[1] - h / 2, 
+                       ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])
+    
+    ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh}
+    if self.opt.dense_wh:
+      # Class-max heatmap, duplicated for the two dense_wh channels.
+      hm_a = hm.max(axis=0, keepdims=True)
+      dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0)
+      ret.update({'dense_wh': dense_wh, 'dense_wh_mask': dense_wh_mask})
+      del ret['wh']
+    elif self.opt.cat_spec_wh:
+      ret.update({'cat_spec_wh': cat_spec_wh, 'cat_spec_mask': cat_spec_mask})
+      del ret['wh']
+    if self.opt.reg_offset:
+      ret.update({'reg': reg})
+    if self.opt.debug > 0 or not self.split == 'train':
+      # A single all-zero row stands in when no object survived.
+      gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
+               np.zeros((1, 6), dtype=np.float32)
+      meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id}
+      ret['meta'] = meta
+    return ret
\ No newline at end of file
diff --git a/src/lib/customdatasets/sample/ctseg.py b/src/lib/customdatasets/sample/ctseg.py
new file mode 100644
index 0000000..d339024
--- /dev/null
+++ b/src/lib/customdatasets/sample/ctseg.py
@@ -0,0 +1,164 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.utils.data as data
+import numpy as np
+import torch
+import json
+import cv2
+import os
+from utils.image import flip, color_aug
+from utils.image import get_affine_transform, affine_transform
+from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
+from utils.image import draw_dense_reg
+import math
+
+
+class CTSegDataset(data.Dataset):
+    """Sample builder for the 'ctseg' task.
+
+    Builds the same targets as the 'ctdet' sample builder and additionally
+    returns one warped instance mask per annotation. The attributes read
+    from ``self`` (``images``, ``coco``, ``img_dir``, ``max_objs``,
+    ``cat_ids``, ``num_classes``, ``mean``, ``std``, ``split``, ``opt``,
+    ``_data_rng``, ``_eig_val``, ``_eig_vec``) are not assigned in this
+    class; they are expected to come from the dataset class it is combined
+    with.
+    """
+
+    def _coco_box_to_bbox(self, box):
+        """Convert a ``[x, y, w, h]`` box to float32 ``[x1, y1, x2, y2]``."""
+        bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                        dtype=np.float32)
+        return bbox
+
+    def _get_border(self, border, size):
+        """Halve ``border`` (//1, //2, ...) until ``size - b > b``; return b."""
+        i = 1
+        while size - border // i <= border // i:
+            i *= 2
+        return border // i
+
+    def __getitem__(self, index):
+        """Load image ``index`` and build the input, targets and masks.
+
+        Returns:
+          A dict with 'input', 'hm', 'reg_mask', 'ind', 'wh' and
+          'instance_mask'. Depending on ``self.opt``, 'wh' is replaced by
+          'dense_wh'/'dense_wh_mask' or 'cat_spec_wh'/'cat_spec_mask',
+          'reg' is added, and 'meta' is added when ``opt.debug > 0`` or the
+          split is not 'train'.
+        """
+        img_id = self.images[index]
+        file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
+        img_path = os.path.join(self.img_dir, file_name)
+        ann_ids = self.coco.getAnnIds(imgIds=[img_id])
+        anns = self.coco.loadAnns(ids=ann_ids)
+        # Annotations beyond max_objs are ignored.
+        num_objs = min(len(anns), self.max_objs)
+
+        img = cv2.imread(img_path)
+
+        height, width = img.shape[0], img.shape[1]
+        # c: crop centre as (x, y), initially the image centre.
+        c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
+        if self.opt.keep_res:
+            # presumably opt.pad has the form 2**k - 1, so this rounds up
+            # to a multiple of pad + 1 -- TODO confirm
+            input_h = (height | self.opt.pad) + 1
+            input_w = (width | self.opt.pad) + 1
+            s = np.array([input_w, input_h], dtype=np.float32)
+        else:
+            # s: scale handed to get_affine_transform (longer image side).
+            s = max(img.shape[0], img.shape[1]) * 1.0
+            input_h, input_w = self.opt.input_h, self.opt.input_w
+
+        flipped = False
+        if self.split == 'train':
+            if not self.opt.not_rand_crop:
+                # Random scale factor from arange(0.6, 1.4, 0.1) and a
+                # random centre kept one border away from each image edge.
+                s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
+                w_border = self._get_border(128, img.shape[1])
+                h_border = self._get_border(128, img.shape[0])
+                c[0] = np.random.randint(
+                    low=w_border, high=img.shape[1] - w_border)
+                c[1] = np.random.randint(
+                    low=h_border, high=img.shape[0] - h_border)
+            else:
+                # Gaussian jitter of centre and scale, clipped to +-2*cf
+                # and +-sf.
+                sf = self.opt.scale
+                cf = self.opt.shift
+                c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
+                c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
+                s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
+
+            if np.random.random() < self.opt.flip:
+                # Mirror the image left-right and mirror the centre x too.
+                flipped = True
+                img = img[:, ::-1, :]
+                c[0] = width - c[0] - 1
+
+        trans_input = get_affine_transform(
+            c, s, 0, [input_w, input_h])
+        inp = cv2.warpAffine(img, trans_input,
+                             (input_w, input_h),
+                             flags=cv2.INTER_LINEAR)
+        inp = (inp.astype(np.float32) / 255.)
+        if self.split == 'train' and not self.opt.no_color_aug:
+            # Return value is unused, so color_aug presumably edits inp
+            # in place.
+            color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
+        inp = (inp - self.mean) / self.std
+        # Move the channel axis first: (H, W, C) -> (C, H, W).
+        inp = inp.transpose(2, 0, 1)
+
+        output_h = input_h // self.opt.down_ratio
+        output_w = input_w // self.opt.down_ratio
+        num_classes = self.num_classes
+        # Same centre/scale as the input transform, at output resolution.
+        trans_output = get_affine_transform(c, s, 0, [output_w, output_h])
+
+        # Targets; per-object arrays have max_objs rows, unused rows stay 0.
+        hm = np.zeros((num_classes, output_h, output_w), dtype=np.float32)
+        wh = np.zeros((self.max_objs, 2), dtype=np.float32)
+        dense_wh = np.zeros((2, output_h, output_w), dtype=np.float32)
+        reg = np.zeros((self.max_objs, 2), dtype=np.float32)
+        ind = np.zeros((self.max_objs), dtype=np.int64)
+        reg_mask = np.zeros((self.max_objs), dtype=np.uint8)
+        cat_spec_wh = np.zeros(
+            (self.max_objs, num_classes * 2), dtype=np.float32)
+        cat_spec_mask = np.zeros(
+            (self.max_objs, num_classes * 2), dtype=np.uint8)
+        # One mask per annotation (at least one row), unlike the arrays
+        # above which always have max_objs rows.
+        # NOTE(review): the leading dimension differs between images --
+        # confirm the data loader's collate step can handle that.
+        instance_masks = np.zeros(
+            (num_objs if num_objs > 0 else 1, output_h, output_w), dtype=np.float32)
+        draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
+            draw_umich_gaussian
+
+        gt_det = []
+
+        for k in range(num_objs):
+
+            ann = anns[k]
+            instance_mask = self.coco.annToMask(ann)
+
+            bbox = self._coco_box_to_bbox(ann['bbox'])
+            cls_id = int(self.cat_ids[ann['category_id']])
+            if flipped:
+                # Mirror x1/x2 (swapped so x1 <= x2) and mirror the mask.
+                bbox[[0, 2]] = width - bbox[[2, 0]] - 1
+                instance_mask = instance_mask[:, ::-1]
+            # Map both corners to output coordinates, then clip to the map.
+            bbox[:2] = affine_transform(bbox[:2], trans_output)
+            bbox[2:] = affine_transform(bbox[2:], trans_output)
+            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1)
+            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1)
+            # Warp the mask with the same transform as the box corners.
+            instance_mask = cv2.warpAffine(instance_mask, trans_output,
+                                           (output_w, output_h),
+                                           flags=cv2.INTER_LINEAR)
+            instance_mask = instance_mask.astype(np.float32)
+
+            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+            # Boxes that collapse after clipping are skipped; their row in
+            # instance_masks stays all zero.
+            if h > 0 and w > 0:
+                radius = gaussian_radius((math.ceil(h), math.ceil(w)))
+                radius = max(0, int(radius))
+                # With mse_loss the fixed opt.hm_gauss replaces the radius.
+                radius = self.opt.hm_gauss if self.opt.mse_loss else radius
+                ct = np.array(
+                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
+                ct_int = ct.astype(np.int32)
+                draw_gaussian(hm[cls_id], ct_int, radius)
+                wh[k] = 1. * w, 1. * h
+                # Row-major flat index of the integer centre.
+                ind[k] = ct_int[1] * output_w + ct_int[0]
+                # Fractional part of the centre lost by the integer cast.
+                reg[k] = ct - ct_int
+                reg_mask[k] = 1
+                # Class-specific size: only this class's two columns are set.
+                cat_spec_wh[k, cls_id * 2: cls_id * 2 + 2] = wh[k]
+                cat_spec_mask[k, cls_id * 2: cls_id * 2 + 2] = 1
+                instance_masks[k] = instance_mask
+                if self.opt.dense_wh:
+                    draw_dense_reg(dense_wh, hm.max(
+                        axis=0), ct_int, wh[k], radius)
+                # Ground-truth row: [x1, y1, x2, y2, 1, cls_id].
+                gt_det.append([ct[0] - w / 2, ct[1] - h / 2,
+                               ct[0] + w / 2, ct[1] + h / 2, 1, cls_id])
+
+        ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh,
+               "instance_mask": instance_masks}
+        if self.opt.dense_wh:
+            # Class-max heatmap, duplicated for the two dense_wh channels.
+            hm_a = hm.max(axis=0, keepdims=True)
+            dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0)
+            ret.update({'dense_wh': dense_wh, 'dense_wh_mask': dense_wh_mask})
+            del ret['wh']
+        elif self.opt.cat_spec_wh:
+            ret.update({'cat_spec_wh': cat_spec_wh,
+                        'cat_spec_mask': cat_spec_mask})
+            del ret['wh']
+        if self.opt.reg_offset:
+            ret.update({'reg': reg})
+        if self.opt.debug > 0 or not self.split == 'train':
+            # A single all-zero row stands in when no object survived.
+            gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
+                np.zeros((1, 6), dtype=np.float32)
+            meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id}
+            ret['meta'] = meta
+        return ret
diff --git a/src/lib/customdatasets/sample/ddd.py b/src/lib/customdatasets/sample/ddd.py
new file mode 100644
index 0000000..c7426e0
--- /dev/null
+++ b/src/lib/customdatasets/sample/ddd.py
@@ -0,0 +1,170 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.utils.data as data
+import pycocotools.coco as coco
+import numpy as np
+import torch
+import json
+import cv2
+import os
+import math
+from utils.image import flip, color_aug
+from utils.image import get_affine_transform, affine_transform
+from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
+import pycocotools.coco as coco
+
+class DddDataset(data.Dataset):
+  """Sample builder for the 'ddd' task (depth, dimension, rotation targets).
+
+  The attributes read from ``self`` (``images``, ``coco``, ``img_dir``,
+  ``max_objs``, ``cat_ids``, ``mean``, ``std``, ``split``, ``opt``,
+  ``alpha_in_degree`` and, as a fallback, ``calib``) are not assigned in
+  this class; they are expected to come from the dataset class it is
+  combined with.
+  """
+
+  def _coco_box_to_bbox(self, box):
+    """Convert a ``[x, y, w, h]`` box to float32 ``[x1, y1, x2, y2]``."""
+    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                    dtype=np.float32)
+    return bbox
+
+  def _convert_alpha(self, alpha):
+    """Return ``alpha`` unchanged, or ``radians(alpha + 45)`` when
+    ``self.alpha_in_degree`` is set."""
+    return math.radians(alpha + 45) if self.alpha_in_degree else alpha
+
+  def __getitem__(self, index):
+    """Load image ``index`` and build the input and the 3D targets.
+
+    Returns:
+      A dict with 'input', 'hm', 'dep', 'dim', 'ind', 'rotbin', 'rotres',
+      'reg_mask' and 'rot_mask'; 'wh' is added with ``opt.reg_bbox``, 'reg'
+      with ``opt.reg_offset``, and 'meta' when ``opt.debug > 0`` or
+      'train' is not part of the split name.
+    """
+    img_id = self.images[index]
+    img_info = self.coco.loadImgs(ids=[img_id])[0]
+    img_path = os.path.join(self.img_dir, img_info['file_name'])
+    img = cv2.imread(img_path)
+    if 'calib' in img_info:
+      calib = np.array(img_info['calib'], dtype=np.float32)
+    else:
+      # NOTE(review): self.calib is not set anywhere in this class --
+      # confirm the combined dataset class provides it.
+      calib = self.calib
+
+    height, width = img.shape[0], img.shape[1]
+    # c: crop centre as (x, y); s: (width, height) scale for the transform.
+    c = np.array([img.shape[1] / 2., img.shape[0] / 2.])
+    if self.opt.keep_res:
+      s = np.array([self.opt.input_w, self.opt.input_h], dtype=np.int32)
+    else:
+      s = np.array([width, height], dtype=np.int32)
+    
+    aug = False
+    if self.split == 'train' and np.random.random() < self.opt.aug_ddd:
+      # Gaussian jitter of scale and centre, clipped to +-sf and +-2*cf.
+      aug = True
+      sf = self.opt.scale
+      cf = self.opt.shift
+      s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
+      c[0] += img.shape[1] * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+      c[1] += img.shape[0] * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+
+    trans_input = get_affine_transform(
+      c, s, 0, [self.opt.input_w, self.opt.input_h])
+    inp = cv2.warpAffine(img, trans_input, 
+                         (self.opt.input_w, self.opt.input_h),
+                         flags=cv2.INTER_LINEAR)
+    inp = (inp.astype(np.float32) / 255.)
+    # if self.split == 'train' and not self.opt.no_color_aug:
+    #   color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
+    inp = (inp - self.mean) / self.std
+    # Move the channel axis first: (H, W, C) -> (C, H, W).
+    inp = inp.transpose(2, 0, 1)
+
+    # Unlike the 2D sample builders, the class count comes from opt here.
+    num_classes = self.opt.num_classes
+    trans_output = get_affine_transform(
+      c, s, 0, [self.opt.output_w, self.opt.output_h])
+
+    # Targets; per-object arrays have max_objs rows, unused rows stay zero.
+    hm = np.zeros(
+      (num_classes, self.opt.output_h, self.opt.output_w), dtype=np.float32)
+    wh = np.zeros((self.max_objs, 2), dtype=np.float32)
+    reg = np.zeros((self.max_objs, 2), dtype=np.float32)
+    dep = np.zeros((self.max_objs, 1), dtype=np.float32)
+    rotbin = np.zeros((self.max_objs, 2), dtype=np.int64)
+    rotres = np.zeros((self.max_objs, 2), dtype=np.float32)
+    dim = np.zeros((self.max_objs, 3), dtype=np.float32)
+    ind = np.zeros((self.max_objs), dtype=np.int64)
+    reg_mask = np.zeros((self.max_objs), dtype=np.uint8)
+    rot_mask = np.zeros((self.max_objs), dtype=np.uint8)
+
+    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
+    anns = self.coco.loadAnns(ids=ann_ids)
+    num_objs = min(len(anns), self.max_objs)
+    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
+                    draw_umich_gaussian
+    gt_det = []
+    for k in range(num_objs):
+      ann = anns[k]
+      bbox = self._coco_box_to_bbox(ann['bbox'])
+      cls_id = int(self.cat_ids[ann['category_id']])
+      # Class ids of -99 or lower are dropped entirely.
+      if cls_id <= -99:
+        continue
+      # if flipped:
+      #   bbox[[0, 2]] = width - bbox[[2, 0]] - 1
+      # Map both corners to output coordinates, then clip to the map.
+      bbox[:2] = affine_transform(bbox[:2], trans_output)
+      bbox[2:] = affine_transform(bbox[2:], trans_output)
+      bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, self.opt.output_w - 1)
+      bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, self.opt.output_h - 1)
+      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+      if h > 0 and w > 0:
+        radius = gaussian_radius((h, w))
+        radius = max(0, int(radius))
+        ct = np.array(
+          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
+        ct_int = ct.astype(np.int32)
+        if cls_id < 0:
+          # Remaining negative ids only mark heatmap regions with 0.9999 and
+          # produce no regression targets: -1 marks every class channel,
+          # any other value marks channel (-cls_id - 2).
+          ignore_id = [_ for _ in range(num_classes)] \
+                      if cls_id == - 1 else  [- cls_id - 2]
+          if self.opt.rect_mask:
+            hm[ignore_id, int(bbox[1]): int(bbox[3]) + 1, 
+              int(bbox[0]): int(bbox[2]) + 1] = 0.9999
+          else:
+            for cc in ignore_id:
+              draw_gaussian(hm[cc], ct, radius)
+            hm[ignore_id, ct_int[1], ct_int[0]] = 0.9999
+          continue
+        draw_gaussian(hm[cls_id], ct, radius)
+
+        wh[k] = 1. * w, 1. * h
+        # Ground-truth row: [ct_x, ct_y, 1] + 8 rotation values + [depth]
+        # + 3 dimensions + [cls_id], i.e. 16 entries.
+        gt_det.append([ct[0], ct[1], 1] + \
+                      self._alpha_to_8(self._convert_alpha(ann['alpha'])) + \
+                      [ann['depth']] + (np.array(ann['dim']) / 1).tolist() + [cls_id])
+        if self.opt.reg_bbox:
+          # Insert [w, h] just before the trailing cls_id (18 entries).
+          gt_det[-1] = gt_det[-1][:-1] + [w, h] + [gt_det[-1][-1]]
+        # if (not self.opt.car_only) or cls_id == 1: # Only estimate ADD for cars !!!
+        if 1:
+          alpha = self._convert_alpha(ann['alpha'])
+          # print('img_id cls_id alpha rot_y', img_path, cls_id, alpha, ann['rotation_y'])
+          # Two overlapping angle bins; an angle may fall into both. The
+          # residual is the offset from the bin centre (-pi/2 or +pi/2).
+          if alpha < np.pi / 6. or alpha > 5 * np.pi / 6.:
+            rotbin[k, 0] = 1
+            rotres[k, 0] = alpha - (-0.5 * np.pi)    
+          if alpha > -np.pi / 6. or alpha < -5 * np.pi / 6.:
+            rotbin[k, 1] = 1
+            rotres[k, 1] = alpha - (0.5 * np.pi)
+          dep[k] = ann['depth']
+          dim[k] = ann['dim']
+          # print('        cat dim', cls_id, dim[k])
+          # Row-major flat index of the integer centre in the output map.
+          ind[k] = ct_int[1] * self.opt.output_w + ct_int[0]
+          reg[k] = ct - ct_int
+          # reg_mask is switched off for augmented samples; rot_mask is not.
+          reg_mask[k] = 1 if not aug else 0
+          rot_mask[k] = 1
+    # print('gt_det', gt_det)
+    # print('')
+    ret = {'input': inp, 'hm': hm, 'dep': dep, 'dim': dim, 'ind': ind, 
+           'rotbin': rotbin, 'rotres': rotres, 'reg_mask': reg_mask,
+           'rot_mask': rot_mask}
+    if self.opt.reg_bbox:
+      ret.update({'wh': wh})
+    if self.opt.reg_offset:
+      ret.update({'reg': reg})
+    if self.opt.debug > 0 or not ('train' in self.split):
+      # NOTE(review): the empty fallback always has 18 columns, while real
+      # rows have 16 entries unless opt.reg_bbox is set -- confirm intended.
+      gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
+               np.zeros((1, 18), dtype=np.float32)
+      meta = {'c': c, 's': s, 'gt_det': gt_det, 'calib': calib,
+              'image_path': img_path, 'img_id': img_id}
+      ret['meta'] = meta
+    
+    return ret
+
+  def _alpha_to_8(self, alpha):
+    """Encode ``alpha`` as 8 values: two groups of 4, one per angle bin.
+
+    For each bin that ``alpha`` falls into, index 1 (resp. 5) of the result
+    is set to 1 and indices 2-3 (resp. 6-7) hold the sine and cosine of the
+    offset from the bin centre (-pi/2 resp. +pi/2). Bins that do not match
+    keep the defaults ``[0, 0, 0, 1]``.
+    """
+    # return [alpha, 0, 0, 0, 0, 0, 0, 0]
+    ret = [0, 0, 0, 1, 0, 0, 0, 1]
+    if alpha < np.pi / 6. or alpha > 5 * np.pi / 6.:
+      r = alpha - (-0.5 * np.pi)
+      ret[1] = 1
+      ret[2], ret[3] = np.sin(r), np.cos(r)
+    if alpha > -np.pi / 6. or alpha < -5 * np.pi / 6.:
+      r = alpha - (0.5 * np.pi)
+      ret[5] = 1
+      ret[6], ret[7] = np.sin(r), np.cos(r)
+    return ret
diff --git a/src/lib/customdatasets/sample/exdet.py b/src/lib/customdatasets/sample/exdet.py
new file mode 100644
index 0000000..8518bed
--- /dev/null
+++ b/src/lib/customdatasets/sample/exdet.py
@@ -0,0 +1,137 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.utils.data as data
+import pycocotools.coco as coco
+import numpy as np
+import torch
+import json
+import cv2
+import os
+from utils.image import flip, color_aug
+from utils.image import get_affine_transform, affine_transform
+from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
+import pycocotools.coco as coco
+import math
+
+class EXDetDataset(data.Dataset):
+  def _coco_box_to_bbox(self, box):  # [b0, b1, b2, b3] -> [b0, b1, b0 + b2, b1 + b3]
+    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                    dtype=np.float32)  # presumably COCO [x, y, w, h] -> corner form
+    return bbox
+
+  def _get_border(self, border, size):  # shrink border until it fits inside size
+    i = 1
+    while size - border // i <= border // i:  # loop while 2 * (border // i) >= size
+        i *= 2
+    return border // i
+
+  def __getitem__(self, index):  # build one sample: input image + extreme-point targets
+    img_id = self.images[index]
+    img_info = self.coco.loadImgs(ids=[img_id])[0]
+    img_path = os.path.join(self.img_dir, img_info['file_name'])
+    img = cv2.imread(img_path)
+
+    height, width = img.shape[0], img.shape[1]
+    c = np.array([img.shape[1] / 2., img.shape[0] / 2.])  # crop center (x, y)
+    s = max(img.shape[0], img.shape[1]) * 1.0  # crop scale: longer image side
+    # Augmentation below only applies to the 'train' split.
+    flipped = False
+    if self.split == 'train':
+      if not self.opt.not_rand_crop:
+        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
+        w_border = self._get_border(128, img.shape[1])
+        h_border = self._get_border(128, img.shape[0])
+        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
+        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
+      else:
+        sf = self.opt.scale
+        cf = self.opt.shift
+        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
+        c[0] += img.shape[1] * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+        c[1] += img.shape[0] * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+      if np.random.random() < self.opt.flip:
+        flipped = True
+        img = img[:, ::-1, :]  # mirror along the width axis
+
+    trans_input = get_affine_transform(
+      c, s, 0, [self.opt.input_res, self.opt.input_res])
+    inp = cv2.warpAffine(img, trans_input, 
+                         (self.opt.input_res, self.opt.input_res),
+                         flags=cv2.INTER_LINEAR)
+    inp = (inp.astype(np.float32) / 255.)
+    if self.split == 'train' and not self.opt.no_color_aug:
+      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
+    inp = (inp - self.mean) / self.std
+    inp = inp.transpose(2, 0, 1)  # HWC -> CHW
+
+    output_res = self.opt.output_res
+    num_classes = self.opt.num_classes
+    trans_output = get_affine_transform(c, s, 0, [output_res, output_res])
+    num_hm = 1 if self.opt.agnostic_ex else num_classes  # single shared map if agnostic
+
+    hm_t = np.zeros((num_hm, output_res, output_res), dtype=np.float32)
+    hm_l = np.zeros((num_hm, output_res, output_res), dtype=np.float32)
+    hm_b = np.zeros((num_hm, output_res, output_res), dtype=np.float32)
+    hm_r = np.zeros((num_hm, output_res, output_res), dtype=np.float32)
+    hm_c = np.zeros((num_classes, output_res, output_res), dtype=np.float32)
+    reg_t = np.zeros((self.max_objs, 2), dtype=np.float32)
+    reg_l = np.zeros((self.max_objs, 2), dtype=np.float32)
+    reg_b = np.zeros((self.max_objs, 2), dtype=np.float32)
+    reg_r = np.zeros((self.max_objs, 2), dtype=np.float32)
+    ind_t = np.zeros((self.max_objs), dtype=np.int64)
+    ind_l = np.zeros((self.max_objs), dtype=np.int64)
+    ind_b = np.zeros((self.max_objs), dtype=np.int64)
+    ind_r = np.zeros((self.max_objs), dtype=np.int64)
+    reg_mask = np.zeros((self.max_objs), dtype=np.uint8)
+    # Annotations beyond max_objs are dropped (see num_objs below).
+    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
+    anns = self.coco.loadAnns(ids=ann_ids)
+    num_objs = min(len(anns), self.max_objs)
+    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
+                    draw_umich_gaussian
+
+    for k in range(num_objs):
+      ann = anns[k]
+      # bbox = self._coco_box_to_bbox(ann['bbox'])
+      # tlbr
+      pts = np.array(ann['extreme_points'], dtype=np.float32).reshape(4, 2)
+      # cls_id = int(self.cat_ids[ann['category_id']] - 1) # bug
+      cls_id = int(self.cat_ids[ann['category_id']])
+      hm_id = 0 if self.opt.agnostic_ex else cls_id
+      if flipped:
+        pts[:, 0] = width - pts[:, 0] - 1
+        pts[1], pts[3] = pts[3].copy(), pts[1].copy()  # mirroring swaps rows 1 and 3
+      for j in range(4):
+        pts[j] = affine_transform(pts[j], trans_output)
+      pts = np.clip(pts, 0, self.opt.output_res - 1)
+      h, w = pts[2, 1] - pts[0, 1], pts[3, 0] - pts[1, 0]  # extent spanned by the points
+      if h > 0 and w > 0:
+        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
+        radius = max(0, int(radius))
+        pt_int = pts.astype(np.int32)
+        draw_gaussian(hm_t[hm_id], pt_int[0], radius)
+        draw_gaussian(hm_l[hm_id], pt_int[1], radius)
+        draw_gaussian(hm_b[hm_id], pt_int[2], radius)
+        draw_gaussian(hm_r[hm_id], pt_int[3], radius)
+        reg_t[k] = pts[0] - pt_int[0]  # sub-pixel offset lost by the int cast
+        reg_l[k] = pts[1] - pt_int[1]
+        reg_b[k] = pts[2] - pt_int[2]
+        reg_r[k] = pts[3] - pt_int[3]
+        ind_t[k] = pt_int[0, 1] * output_res + pt_int[0, 0]  # flat index: y * width + x
+        ind_l[k] = pt_int[1, 1] * output_res + pt_int[1, 0]
+        ind_b[k] = pt_int[2, 1] * output_res + pt_int[2, 0]
+        ind_r[k] = pt_int[3, 1] * output_res + pt_int[3, 0]
+
+        ct = [int((pts[3, 0] + pts[1, 0]) / 2), int((pts[0, 1] + pts[2, 1]) / 2)]
+        draw_gaussian(hm_c[cls_id], ct, radius)
+        reg_mask[k] = 1
+    ret = {'input': inp, 'hm_t': hm_t, 'hm_l': hm_l, 'hm_b': hm_b, 
+            'hm_r': hm_r, 'hm_c': hm_c}
+    if self.opt.reg_offset:
+      ret.update({'reg_mask': reg_mask,
+        'reg_t': reg_t, 'reg_l': reg_l, 'reg_b': reg_b, 'reg_r': reg_r,
+        'ind_t': ind_t, 'ind_l': ind_l, 'ind_b': ind_b, 'ind_r': ind_r})
+    # Offset targets are only included when opt.reg_offset is set.
+    return ret
\ No newline at end of file
diff --git a/src/lib/customdatasets/sample/multi_pose.py b/src/lib/customdatasets/sample/multi_pose.py
new file mode 100644
index 0000000..5c23806
--- /dev/null
+++ b/src/lib/customdatasets/sample/multi_pose.py
@@ -0,0 +1,183 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch.utils.data as data
+import numpy as np
+import torch
+import json
+import cv2
+import os
+from utils.image import flip, color_aug
+from utils.image import get_affine_transform, affine_transform
+from utils.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian
+from utils.image import draw_dense_reg
+import math
+
+class MultiPoseDataset(data.Dataset):
+  def _coco_box_to_bbox(self, box):  # [b0, b1, b2, b3] -> [b0, b1, b0 + b2, b1 + b3]
+    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                    dtype=np.float32)  # presumably COCO [x, y, w, h] -> corner form
+    return bbox
+
+  def _get_border(self, border, size):  # shrink border until it fits inside size
+    i = 1
+    while size - border // i <= border // i:  # loop while 2 * (border // i) >= size
+        i *= 2
+    return border // i
+
+  def __getitem__(self, index):
+    """Build one sample: normalized input image plus center/keypoint targets."""
+    img_id = self.images[index]
+    file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
+    img_path = os.path.join(self.img_dir, file_name)
+    ann_ids = self.coco.getAnnIds(imgIds=[img_id])
+    anns = self.coco.loadAnns(ids=ann_ids)
+    num_objs = min(len(anns), self.max_objs)  # annotations past max_objs are dropped
+
+    img = cv2.imread(img_path)
+
+    height, width = img.shape[0], img.shape[1]
+    c = np.array([img.shape[1] / 2., img.shape[0] / 2.], dtype=np.float32)
+    s = max(img.shape[0], img.shape[1]) * 1.0
+    rot = 0
+
+    flipped = False
+    if self.split == 'train':
+      if not self.opt.not_rand_crop:
+        s = s * np.random.choice(np.arange(0.6, 1.4, 0.1))
+        w_border = self._get_border(128, img.shape[1])
+        h_border = self._get_border(128, img.shape[0])
+        c[0] = np.random.randint(low=w_border, high=img.shape[1] - w_border)
+        c[1] = np.random.randint(low=h_border, high=img.shape[0] - h_border)
+      else:
+        sf = self.opt.scale
+        cf = self.opt.shift
+        c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+        c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
+        s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
+      if np.random.random() < self.opt.aug_rot:
+        rf = self.opt.rotate
+        rot = np.clip(np.random.randn()*rf, -rf*2, rf*2)
+
+      if np.random.random() < self.opt.flip:
+        flipped = True
+        img = img[:, ::-1, :]
+        c[0] =  width - c[0] - 1  # keep the crop center on the mirrored image
+
+    trans_input = get_affine_transform(
+      c, s, rot, [self.opt.input_res, self.opt.input_res])
+    inp = cv2.warpAffine(img, trans_input,
+                         (self.opt.input_res, self.opt.input_res),
+                         flags=cv2.INTER_LINEAR)
+    inp = (inp.astype(np.float32) / 255.)
+    if self.split == 'train' and not self.opt.no_color_aug:
+      color_aug(self._data_rng, inp, self._eig_val, self._eig_vec)
+    inp = (inp - self.mean) / self.std
+    inp = inp.transpose(2, 0, 1)  # HWC -> CHW
+
+    output_res = self.opt.output_res
+    num_joints = self.num_joints
+    trans_output_rot = get_affine_transform(c, s, rot, [output_res, output_res])
+    trans_output = get_affine_transform(c, s, 0, [output_res, output_res])
+
+    hm = np.zeros((self.num_classes, output_res, output_res), dtype=np.float32)
+    hm_hp = np.zeros((num_joints, output_res, output_res), dtype=np.float32)
+    dense_kps = np.zeros((num_joints, 2, output_res, output_res),
+                          dtype=np.float32)
+    dense_kps_mask = np.zeros((num_joints, output_res, output_res),
+                               dtype=np.float32)
+    wh = np.zeros((self.max_objs, 2), dtype=np.float32)
+    kps = np.zeros((self.max_objs, num_joints * 2), dtype=np.float32)
+    reg = np.zeros((self.max_objs, 2), dtype=np.float32)
+    ind = np.zeros((self.max_objs), dtype=np.int64)
+    reg_mask = np.zeros((self.max_objs), dtype=np.uint8)
+    kps_mask = np.zeros((self.max_objs, self.num_joints * 2), dtype=np.uint8)
+    hp_offset = np.zeros((self.max_objs * num_joints, 2), dtype=np.float32)
+    hp_ind = np.zeros((self.max_objs * num_joints), dtype=np.int64)
+    hp_mask = np.zeros((self.max_objs * num_joints), dtype=np.int64)
+
+    draw_gaussian = draw_msra_gaussian if self.opt.mse_loss else \
+                    draw_umich_gaussian
+
+    gt_det = []
+    for k in range(num_objs):
+      ann = anns[k]
+      bbox = self._coco_box_to_bbox(ann['bbox'])
+      cls_id = int(ann['category_id']) - 1
+      pts = np.array(ann['keypoints'], np.float32).reshape(num_joints, 3)
+      if flipped:
+        bbox[[0, 2]] = width - bbox[[2, 0]] - 1
+        pts[:, 0] = width - pts[:, 0] - 1
+        for e in self.flip_idx:  # swap each joint pair listed in flip_idx
+          pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy()
+      bbox[:2] = affine_transform(bbox[:2], trans_output)
+      bbox[2:] = affine_transform(bbox[2:], trans_output)
+      bbox = np.clip(bbox, 0, output_res - 1)
+      h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
+      if (h > 0 and w > 0) or (rot != 0):
+        radius = gaussian_radius((math.ceil(h), math.ceil(w)))
+        radius = self.opt.hm_gauss if self.opt.mse_loss else max(0, int(radius))
+        ct = np.array(
+          [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], dtype=np.float32)
+        ct_int = ct.astype(np.int32)
+        wh[k] = 1. * w, 1. * h
+        ind[k] = ct_int[1] * output_res + ct_int[0]  # flat index: y * width + x
+        reg[k] = ct - ct_int
+        reg_mask[k] = 1
+        num_kpts = pts[:, 2].sum()
+        if num_kpts == 0:  # object without any flagged keypoint: masked out
+          hm[cls_id, ct_int[1], ct_int[0]] = 0.9999
+          reg_mask[k] = 0
+
+        hp_radius = gaussian_radius((math.ceil(h), math.ceil(w)))
+        hp_radius = self.opt.hm_gauss \
+                    if self.opt.mse_loss else max(0, int(hp_radius))
+        for j in range(num_joints):
+          if pts[j, 2] > 0:
+            pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot)
+            if pts[j, 0] >= 0 and pts[j, 0] < output_res and \
+               pts[j, 1] >= 0 and pts[j, 1] < output_res:
+              kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int
+              kps_mask[k, j * 2: j * 2 + 2] = 1
+              pt_int = pts[j, :2].astype(np.int32)
+              hp_offset[k * num_joints + j] = pts[j, :2] - pt_int
+              hp_ind[k * num_joints + j] = pt_int[1] * output_res + pt_int[0]
+              hp_mask[k * num_joints + j] = 1
+              if self.opt.dense_hp:
+                # must be before draw center hm gaussian
+                draw_dense_reg(dense_kps[j], hm[cls_id], ct_int,
+                               pts[j, :2] - ct_int, radius, is_offset=True)
+                draw_gaussian(dense_kps_mask[j], ct_int, radius)
+              draw_gaussian(hm_hp[j], pt_int, hp_radius)
+        draw_gaussian(hm[cls_id], ct_int, radius)
+        gt_det.append([ct[0] - w / 2, ct[1] - h / 2,
+                       ct[0] + w / 2, ct[1] + h / 2, 1] +
+                       pts[:, :2].reshape(num_joints * 2).tolist() + [cls_id])
+    if rot != 0:  # rotated samples: fill hm with 0.9999 and zero both masks
+      hm = hm * 0 + 0.9999
+      reg_mask *= 0
+      kps_mask *= 0
+    ret = {'input': inp, 'hm': hm, 'reg_mask': reg_mask, 'ind': ind, 'wh': wh,
+           'hps': kps, 'hps_mask': kps_mask}
+    if self.opt.dense_hp:
+      dense_kps = dense_kps.reshape(num_joints * 2, output_res, output_res)
+      dense_kps_mask = dense_kps_mask.reshape(
+        num_joints, 1, output_res, output_res)
+      dense_kps_mask = np.concatenate([dense_kps_mask, dense_kps_mask], axis=1)
+      dense_kps_mask = dense_kps_mask.reshape(
+        num_joints * 2, output_res, output_res)
+      ret.update({'dense_hps': dense_kps, 'dense_hps_mask': dense_kps_mask})
+      del ret['hps'], ret['hps_mask']
+    if self.opt.reg_offset:
+      ret.update({'reg': reg})
+    if self.opt.hm_hp:
+      ret.update({'hm_hp': hm_hp})
+    if self.opt.reg_hp_offset:
+      ret.update({'hp_offset': hp_offset, 'hp_ind': hp_ind, 'hp_mask': hp_mask})
+    if self.opt.debug > 0 or not self.split == 'train':
+      gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
+               np.zeros((1, 6 + num_joints * 2), dtype=np.float32)  # 40 for 17 joints
+      meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id}
+      ret['meta'] = meta
+    return ret
diff --git a/src/lib/detectors/base_detector.py b/src/lib/detectors/base_detector.py
new file mode 100755
index 0000000..cf74b0e
--- /dev/null
+++ b/src/lib/detectors/base_detector.py
@@ -0,0 +1,150 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import numpy as np
+from progress.bar import Bar
+import time
+import torch
+
+from models.model import create_model, load_model
+from utils.image import get_affine_transform
+from utils.debugger import Debugger
+
+
+class BaseDetector(object):
+    def __init__(self, opt):  # build the model, load weights, cache test-time settings
+        if opt.gpus[0] >= 0:  # a negative first GPU id selects the CPU
+            opt.device = torch.device('cuda')
+        else:
+            opt.device = torch.device('cpu')
+
+        print('Creating model...')
+        self.model = create_model(opt.arch, opt.heads, opt.head_conv)
+        self.model = load_model(self.model, opt.load_model)
+        self.model = self.model.to(opt.device)
+        self.model.eval()
+
+        self.mean = np.array(opt.mean, dtype=np.float32).reshape(1, 1, 3)
+        self.std = np.array(opt.std, dtype=np.float32).reshape(1, 1, 3)
+        self.max_per_image = 100  # cap used by subclasses when merging detections
+        self.num_classes = opt.num_classes
+        self.scales = opt.test_scales
+        self.opt = opt
+        self.pause = True  # forwarded to debugger.show_all_imgs(pause=...)
+
+    def pre_process(self, image, scale, meta=None):  # NOTE: incoming meta is ignored
+        height, width = image.shape[0:2]
+        new_height = int(height * scale)
+        new_width = int(width * scale)
+        if self.opt.fix_res:  # fixed network input size taken from opt
+            inp_height, inp_width = self.opt.input_h, self.opt.input_w
+            c = np.array([new_width / 2., new_height / 2.], dtype=np.float32)
+            s = max(height, width) * 1.0
+        else:
+            inp_height = (new_height | self.opt.pad) + 1  # assumes pad is 2**k - 1 - TODO confirm
+            inp_width = (new_width | self.opt.pad) + 1
+            c = np.array([new_width // 2, new_height // 2], dtype=np.float32)
+            s = np.array([inp_width, inp_height], dtype=np.float32)
+
+        trans_input = get_affine_transform(c, s, 0, [inp_width, inp_height])
+        resized_image = cv2.resize(image, (new_width, new_height))
+        inp_image = cv2.warpAffine(
+            resized_image, trans_input, (inp_width, inp_height),
+            flags=cv2.INTER_LINEAR)
+        inp_image = ((inp_image / 255. - self.mean) /
+                     self.std).astype(np.float32)
+
+        images = inp_image.transpose(2, 0, 1).reshape(
+            1, 3, inp_height, inp_width)
+        if self.opt.flip_test:  # append a width-mirrored copy as batch entry 1
+            images = np.concatenate((images, images[:, :, :, ::-1]), axis=0)
+        images = torch.from_numpy(images)
+        meta = {'c': c, 's': s,
+                'out_height': inp_height // self.opt.down_ratio,
+                'out_width': inp_width // self.opt.down_ratio}
+        return images, meta
+
+    def process(self, images, return_time=False):  # hook; run() unpacks (output, dets, forward_time)
+        raise NotImplementedError
+
+    def post_process(self, dets, meta, scale=1):  # hook; run() calls it once per test scale
+        raise NotImplementedError
+
+    def merge_outputs(self, detections):  # hook; receives the list of per-scale results
+        raise NotImplementedError
+
+    def debug(self, debugger, images, dets, output, scale=1):  # hook; called when opt.debug >= 2
+        raise NotImplementedError
+
+    def show_results(self, debugger, image, results):  # hook; called when opt.debug >= 1
+        raise NotImplementedError
+
+    def run(self, image_or_path_or_tensor, meta=None):
+        """Detect over every test scale; return merged results plus timing totals."""
+        load_time, pre_time, net_time, dec_time, post_time = 0, 0, 0, 0, 0
+        merge_time, tot_time = 0, 0
+        debugger = Debugger(dataset=self.opt.dataset, ipynb=(self.opt.debug == 3),
+                            theme=self.opt.debugger_theme)
+        start_time = time.time()
+        pre_processed = False
+        if isinstance(image_or_path_or_tensor, np.ndarray):
+            image = image_or_path_or_tensor
+        elif isinstance(image_or_path_or_tensor, str):  # a path on disk
+            image = cv2.imread(image_or_path_or_tensor)
+        else:
+            # Anything else is treated as an already pre-processed mapping.
+            image = image_or_path_or_tensor['image'][0].numpy()
+            pre_processed_images = image_or_path_or_tensor
+            pre_processed = True
+
+        loaded_time = time.time()
+        load_time += (loaded_time - start_time)
+
+        detections = []
+        for scale in self.scales:
+            scale_start_time = time.time()
+            if not pre_processed:
+                images, meta = self.pre_process(image, scale, meta)
+            else:
+                images = pre_processed_images['images'][scale][0]
+                meta = pre_processed_images['meta'][scale]
+                meta = {k: v.numpy()[0] for k, v in meta.items()}
+            images = images.to(self.opt.device)
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            pre_process_time = time.time()
+            pre_time += pre_process_time - scale_start_time
+
+            output, dets, forward_time = self.process(images, return_time=True)
+
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            net_time += forward_time - pre_process_time
+            decode_time = time.time()
+            dec_time += decode_time - forward_time
+
+            if self.opt.debug >= 2:
+                self.debug(debugger, images, dets, output, scale)
+
+            dets = self.post_process(dets, meta, scale)
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            post_process_time = time.time()
+            post_time += post_process_time - decode_time
+
+            detections.append(dets)
+
+        results = self.merge_outputs(detections)
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        end_time = time.time()
+        merge_time += end_time - post_process_time
+        tot_time += end_time - start_time
+
+        if self.opt.debug >= 1:
+            self.show_results(debugger, image, results)
+
+        return {'results': results, 'tot': tot_time, 'load': load_time,
+                'pre': pre_time, 'net': net_time, 'dec': dec_time,
+                'post': post_time, 'merge': merge_time}
diff --git a/src/lib/detectors/ctdet.py b/src/lib/detectors/ctdet.py
new file mode 100755
index 0000000..5bf0594
--- /dev/null
+++ b/src/lib/detectors/ctdet.py
@@ -0,0 +1,96 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import numpy as np
+from progress.bar import Bar
+import time
+import torch
+
+try:
+  from external.nms import soft_nms
+except:
+  print('NMS not imported! If you need it,'
+        ' do \n cd $CenterNet_ROOT/src/lib/external \n make')
+from models.decode import ctdet_decode
+from models.utils import flip_tensor
+from utils.image import get_affine_transform
+from utils.post_process import ctdet_post_process
+from utils.debugger import Debugger
+
+from .base_detector import BaseDetector
+
+class CtdetDetector(BaseDetector):
+  def __init__(self, opt):  # no extra state; defers entirely to BaseDetector
+    super(CtdetDetector, self).__init__(opt)
+  
+  def process(self, images, return_time=False):
+    """Run the network and decode detections; adds forward_time if return_time."""
+    with torch.no_grad():
+      output = self.model(images)[-1]
+      hm = output['hm'].sigmoid_()
+      wh = output['wh']
+      reg = output['reg'] if self.opt.reg_offset else None
+      if self.opt.flip_test:
+        # Batch entry 1 comes from the mirrored input; average it with entry 0.
+        hm = (hm[0:1] + flip_tensor(hm[1:2])) / 2
+        wh = (wh[0:1] + flip_tensor(wh[1:2])) / 2
+        reg = reg[0:1] if reg is not None else None
+      # Only synchronize when CUDA exists; the bare call raises on CPU-only runs.
+      if torch.cuda.is_available():
+        torch.cuda.synchronize()
+      forward_time = time.time()
+      dets = ctdet_decode(hm, wh, reg=reg, cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)
+    return (output, dets, forward_time) if return_time else (output, dets)
+
+  def post_process(self, dets, meta, scale=1):  # decoded tensor -> per-class numpy arrays
+    dets = dets.detach().cpu().numpy()
+    dets = dets.reshape(1, -1, dets.shape[2])
+    dets = ctdet_post_process(
+        dets.copy(), [meta['c']], [meta['s']],
+        meta['out_height'], meta['out_width'], self.opt.num_classes)
+    for j in range(1, self.num_classes + 1):  # class keys are 1-based
+      dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 5)
+      dets[0][j][:, :4] /= scale  # divide the first four columns by the test scale
+    return dets[0]
+
+  def merge_outputs(self, detections):  # combine per-scale results, cap at max_per_image
+    results = {}
+    for j in range(1, self.num_classes + 1):
+      results[j] = np.concatenate(
+        [detection[j] for detection in detections], axis=0).astype(np.float32)
+      if len(self.scales) > 1 or self.opt.nms:
+         soft_nms(results[j], Nt=0.5, method=2)  # return value unused; presumably in place - confirm
+    scores = np.hstack(
+      [results[j][:, 4] for j in range(1, self.num_classes + 1)])
+    if len(scores) > self.max_per_image:
+      kth = len(scores) - self.max_per_image
+      thresh = np.partition(scores, kth)[kth]  # score of the max_per_image-th best detection
+      for j in range(1, self.num_classes + 1):
+        keep_inds = (results[j][:, 4] >= thresh)
+        results[j] = results[j][keep_inds]
+    return results
+
+  def debug(self, debugger, images, dets, output, scale=1):  # draw heatmap blend and boxes
+    detection = dets.detach().cpu().numpy().copy()
+    detection[:, :, :4] *= self.opt.down_ratio  # scale the first four columns by down_ratio
+    for i in range(1):  # only the first batch entry is visualized
+      img = images[i].detach().cpu().numpy().transpose(1, 2, 0)
+      img = ((img * self.std + self.mean) * 255).astype(np.uint8)  # undo normalization
+      pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
+      debugger.add_blend_img(img, pred, 'pred_hm_{:.1f}'.format(scale))
+      debugger.add_img(img, img_id='out_pred_{:.1f}'.format(scale))
+      for k in range(len(dets[i])):
+        if detection[i, k, 4] > self.opt.center_thresh:
+          debugger.add_coco_bbox(detection[i, k, :4], detection[i, k, -1],
+                                 detection[i, k, 4], 
+                                 img_id='out_pred_{:.1f}'.format(scale))
+
+  def show_results(self, debugger, image, results):  # draw boxes scoring above vis_thresh
+    debugger.add_img(image, img_id='ctdet')
+    for j in range(1, self.num_classes + 1):
+      for bbox in results[j]:
+        if bbox[4] > self.opt.vis_thresh:
+          debugger.add_coco_bbox(bbox[:4], j - 1, bbox[4], img_id='ctdet')  # j - 1: 0-based class
+    debugger.show_all_imgs(pause=self.pause)
diff --git a/src/lib/detectors/ctseg.py b/src/lib/detectors/ctseg.py
new file mode 100644
index 0000000..c1c8532
--- /dev/null
+++ b/src/lib/detectors/ctseg.py
@@ -0,0 +1,72 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import numpy as np
+from progress.bar import Bar
+import time
+import torch
+from pycocotools import mask as mask_utils
+try:
+    from external.nms import soft_nms
+except:
+    print('NMS not imported! If you need it,'
+          ' do \n cd $CenterNet_ROOT/src/lib/external \n make')
+from models.decode import ctseg_decode
+from models.utils import flip_tensor
+from utils.image import get_affine_transform
+from utils.post_process import ctseg_post_process
+from utils.debugger import Debugger
+
+from .base_detector import BaseDetector
+
+
+class CtsegDetector(BaseDetector):
+    def __init__(self, opt):  # no extra state; defers entirely to BaseDetector
+        super(CtsegDetector, self).__init__(opt)
+
+    def process(self, images, return_time=False):
+        """Run the network and decode boxes plus masks (flip_test is unsupported)."""
+        with torch.no_grad():
+            output = self.model(images)[-1]
+            hm = output['hm'].sigmoid_()
+            wh = output['wh']
+            seg_feat = output['seg_feat']
+            conv_weight = output['conv_weight']
+            reg = output['reg'] if self.opt.reg_offset else None
+            assert not self.opt.flip_test, "not support flip_test"
+            # Only synchronize when CUDA exists; the bare call raises on CPU-only runs.
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            forward_time = time.time()
+            dets, masks = ctseg_decode(
+                hm, wh, seg_feat, conv_weight, reg=reg,
+                cat_spec_wh=self.opt.cat_spec_wh, K=self.opt.K)
+        return (output, (dets, masks), forward_time) if return_time else (output, (dets, masks))
+
+    def post_process(self, det_seg, meta, scale=1):  # (dets, masks) tensors -> per-class results
+        assert scale == 1, "not support scale != 1"
+        dets, seg = det_seg
+        dets = dets.detach().cpu().numpy()
+        seg = seg.detach().cpu().numpy()
+        dets = dets.reshape(1, -1, dets.shape[2])
+        dets = ctseg_post_process(
+            dets.copy(), seg.copy(), [meta['c']], [meta['s']],
+            meta['out_height'], meta['out_width'], *meta['img_size'], self.opt.num_classes)  # NOTE(review): BaseDetector.pre_process sets no 'img_size' - confirm the caller supplies it
+        return dets[0]
+
+    def merge_outputs(self, detections):  # no merging: only the first scale's result is used
+        return detections[0]
+
+    def show_results(self, debugger, image, results):  # draw boxes and masks above vis_thresh
+        debugger.add_img(image, img_id='ctseg')
+        for j in range(1, self.num_classes + 1):
+            for i in range(len(results[j]['boxs'])):
+                bbox = results[j]['boxs'][i]
+                mask = mask_utils.decode(results[j]['pred_mask'][i])  # decoded even if filtered below
+                if bbox[4] > self.opt.vis_thresh:
+                    debugger.add_coco_bbox(
+                        bbox[:4], j - 1, bbox[4], img_id='ctseg')
+                    debugger.add_coco_seg(mask, img_id='ctseg')
+        debugger.show_all_imgs(pause=self.pause)
diff --git a/src/lib/detectors/ddd.py b/src/lib/detectors/ddd.py
new file mode 100755
index 0000000..6222599
--- /dev/null
+++ b/src/lib/detectors/ddd.py
@@ -0,0 +1,106 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import numpy as np
+from progress.bar import Bar
+import time
+import torch
+
+
+from models.decode import ddd_decode
+from models.utils import flip_tensor
+from utils.image import get_affine_transform
+from utils.post_process import ddd_post_process
+from utils.debugger import Debugger
+from utils.ddd_utils import compute_box_3d, project_to_image, alpha2rot_y
+from utils.ddd_utils import draw_box_3d, unproject_2d_to_3d
+
+from .base_detector import BaseDetector
+
+class DddDetector(BaseDetector):
+  def __init__(self, opt):  # base setup plus a fallback calibration matrix
+    super(DddDetector, self).__init__(opt)
+    self.calib = np.array([[707.0493, 0, 604.0814, 45.75831],  # 3x4; used when pre_process gets calib=None
+                           [0, 707.0493, 180.5066, -0.3454157],
+                           [0, 0, 1., 0.004981016]], dtype=np.float32)
+
+
+  def pre_process(self, image, scale, calib=None):  # scale is not used in this override
+    height, width = image.shape[0:2]
+    # The third argument is a calibration matrix here (BaseDetector names it meta).
+    inp_height, inp_width = self.opt.input_h, self.opt.input_w
+    c = np.array([width / 2, height / 2], dtype=np.float32)
+    if self.opt.keep_res:
+      s = np.array([inp_width, inp_height], dtype=np.int32)
+    else:
+      s = np.array([width, height], dtype=np.int32)
+
+    trans_input = get_affine_transform(c, s, 0, [inp_width, inp_height])
+    resized_image = image #cv2.resize(image, (width, height))
+    inp_image = cv2.warpAffine(
+      resized_image, trans_input, (inp_width, inp_height),
+      flags=cv2.INTER_LINEAR)
+    inp_image = (inp_image.astype(np.float32) / 255.)
+    inp_image = (inp_image - self.mean) / self.std
+    images = inp_image.transpose(2, 0, 1)[np.newaxis, ...]  # HWC -> 1 x C x H x W
+    calib = np.array(calib, dtype=np.float32) if calib is not None \
+            else self.calib
+    images = torch.from_numpy(images)
+    meta = {'c': c, 's': s, 
+            'out_height': inp_height // self.opt.down_ratio, 
+            'out_width': inp_width // self.opt.down_ratio,
+            'calib': calib}
+    return images, meta
+  
+  def process(self, images, return_time=False):
+    """Run the network, transform the hm/dep heads and decode detections."""
+    use_cuda = torch.cuda.is_available()  # bare synchronize() raises without CUDA
+    with torch.no_grad():
+      if use_cuda:
+        torch.cuda.synchronize()
+      output = self.model(images)[-1]
+      output['hm'] = output['hm'].sigmoid_()
+      output['dep'] = 1. / (output['dep'].sigmoid() + 1e-6) - 1.  # 1 / sigmoid(x) - 1
+      wh = output['wh'] if self.opt.reg_bbox else None
+      reg = output['reg'] if self.opt.reg_offset else None
+      if use_cuda:
+        torch.cuda.synchronize()
+      forward_time = time.time()
+      dets = ddd_decode(output['hm'], output['rot'], output['dep'],
+                          output['dim'], wh=wh, reg=reg, K=self.opt.K)
+    return (output, dets, forward_time) if return_time else (output, dets)
+
+  def post_process(self, dets, meta, scale=1):  # scale is not used here
+    dets = dets.detach().cpu().numpy()
+    detections = ddd_post_process(
+      dets.copy(), [meta['c']], [meta['s']], [meta['calib']], self.opt)
+    self.this_calib = meta['calib']  # remembered for show_results
+    return detections[0]
+
+  def merge_outputs(self, detections):  # keep first-scale rows scoring above peak_thresh
+    results = detections[0]
+    for j in range(1, self.num_classes + 1):
+      if len(results[j]) > 0:  # was len(results[j] > 0): parenthesis misplaced
+        keep_inds = (results[j][:, -1] > self.opt.peak_thresh)  # last column is compared to peak_thresh
+        results[j] = results[j][keep_inds]
+    return results
+
+  def debug(self, debugger, images, dets, output, scale=1):  # visualize first batch entry
+    dets = dets.detach().cpu().numpy()
+    img = images[0].detach().cpu().numpy().transpose(1, 2, 0)
+    img = ((img * self.std + self.mean) * 255).astype(np.uint8)  # undo normalization
+    pred = debugger.gen_colormap(output['hm'][0].detach().cpu().numpy())
+    debugger.add_blend_img(img, pred, 'pred_hm')
+    debugger.add_ct_detection(
+      img, dets[0], show_box=self.opt.reg_bbox, 
+      center_thresh=self.opt.vis_thresh, img_id='det_pred')
+  
+  def show_results(self, debugger, image, results):  # needs this_calib set by post_process
+    debugger.add_3d_detection(
+      image, results, self.this_calib,
+      center_thresh=self.opt.vis_thresh, img_id='add_pred')
+    debugger.add_bird_view(
+      results, center_thresh=self.opt.vis_thresh, img_id='bird_pred')
+    debugger.show_all_imgs(pause=self.pause)
\ No newline at end of file
diff --git a/src/lib/detectors/detector_factory.py b/src/lib/detectors/detector_factory.py
new file mode 100644
index 0000000..c581c8f
--- /dev/null
+++ b/src/lib/detectors/detector_factory.py
@@ -0,0 +1,17 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from .exdet import ExdetDetector
+from .ddd import DddDetector
+from .ctdet import CtdetDetector
+from .ctseg import CtsegDetector
+from .multi_pose import MultiPoseDetector
+
+detector_factory = {  # maps a name string to the detector class imported above
+    'exdet': ExdetDetector,
+    'ddd': DddDetector,
+    'ctdet': CtdetDetector,
+    'ctseg': CtsegDetector,
+    'multi_pose': MultiPoseDetector,
+}
diff --git a/src/lib/detectors/exdet.py b/src/lib/detectors/exdet.py
new file mode 100755
index 0000000..cbf61d4
--- /dev/null
+++ b/src/lib/detectors/exdet.py
@@ -0,0 +1,131 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import _init_paths
+
+import os
+
+import cv2
+import numpy as np
+from progress.bar import Bar
+import time
+import torch
+
+from models.decode import exct_decode, agnex_ct_decode
+from models.utils import flip_tensor
+from utils.image import get_affine_transform, transform_preds
+from utils.post_process import ctdet_post_process
+from utils.debugger import Debugger
+
+from .base_detector import BaseDetector
+
+class ExdetDetector(BaseDetector):
+  def __init__(self, opt):
+    super(ExdetDetector, self).__init__(opt)
+    self.decode = agnex_ct_decode if opt.agnostic_ex else exct_decode
+
+  def process(self, images, return_time=False):
+    with torch.no_grad():
+      torch.cuda.synchronize()
+      output = self.model(images)[-1]
+      t_heat = output['hm_t'].sigmoid_()
+      l_heat = output['hm_l'].sigmoid_()
+      b_heat = output['hm_b'].sigmoid_()
+      r_heat = output['hm_r'].sigmoid_()
+      c_heat = output['hm_c'].sigmoid_()
+      torch.cuda.synchronize()
+      forward_time = time.time()
+      if self.opt.reg_offset:
+        dets = self.decode(t_heat, l_heat, b_heat, r_heat, c_heat, 
+                      output['reg_t'], output['reg_l'],
+                      output['reg_b'], output['reg_r'], 
+                      K=self.opt.K,
+                      scores_thresh=self.opt.scores_thresh,
+                      center_thresh=self.opt.center_thresh,
+                      aggr_weight=self.opt.aggr_weight)
+      else:
+        dets = self.decode(t_heat, l_heat, b_heat, r_heat, c_heat, K=self.opt.K,
+                      scores_thresh=self.opt.scores_thresh,
+                      center_thresh=self.opt.center_thresh,
+                      aggr_weight=self.opt.aggr_weight)
+    if return_time:
+      return output, dets, forward_time
+    else:
+      return output, dets
+
+  def debug(self, debugger, images, dets, output, scale=1):
+    detection = dets.detach().cpu().numpy().copy()
+    detection[:, :, :4] *= self.opt.down_ratio
+    for i in range(1):
+      inp_height, inp_width = images.shape[2], images.shape[3]
+      pred_hm = np.zeros((inp_height, inp_width, 3), dtype=np.uint8)
+      img = images[i].detach().cpu().numpy().transpose(1, 2, 0)
+      img = ((img * self.std + self.mean) * 255).astype(np.uint8)
+      parts = ['t', 'l', 'b', 'r', 'c']
+      for p in parts:
+        tag = 'hm_{}'.format(p)
+        pred = debugger.gen_colormap(
+          output[tag][i].detach().cpu().numpy(), (inp_height, inp_width))
+        if p != 'c':
+          pred_hm = np.maximum(pred_hm, pred)
+        else:
+          debugger.add_blend_img(
+            img, pred, 'pred_{}_{:.1f}'.format(p, scale))
+      debugger.add_blend_img(img, pred_hm, 'pred_{:.1f}'.format(scale))
+      debugger.add_img(img, img_id='out_{:.1f}'.format(scale))
+      for k in range(len(detection[i])):
+        # print('detection', detection[i, k, 4], detection[i, k])
+        if detection[i, k, 4] > 0.01:
+          # print('detection', detection[i, k, 4], detection[i, k])
+          debugger.add_coco_bbox(detection[i, k, :4], detection[i, k, -1],
+                                 detection[i, k, 4], 
+                                 img_id='out_{:.1f}'.format(scale))
+
+  def post_process(self, dets, meta, scale=1):
+    out_width, out_height = meta['out_width'], meta['out_height']
+    dets = dets.detach().cpu().numpy().reshape(2, -1, 14)
+    dets[1, :, [0, 2]] = out_width - dets[1, :, [2, 0]]
+    dets = dets.reshape(1, -1, 14)
+    dets[0, :, 0:2] = transform_preds(
+      dets[0, :, 0:2], meta['c'], meta['s'], (out_width, out_height))
+    dets[0, :, 2:4] = transform_preds(
+      dets[0, :, 2:4], meta['c'], meta['s'], (out_width, out_height))
+    dets[:, :, 0:4] /= scale
+    return dets[0]
+
+  def merge_outputs(self, detections):
+    detections = np.concatenate(
+        [detection for detection in detections], axis=0).astype(np.float32)
+    classes = detections[..., -1]
+    keep_inds = (detections[:, 4] > 0)
+    detections = detections[keep_inds]
+    classes = classes[keep_inds]
+
+    results = {}
+    for j in range(self.num_classes):
+      keep_inds = (classes == j)
+      results[j + 1] = detections[keep_inds][:, 0:7].astype(np.float32)
+      soft_nms(results[j + 1], Nt=0.5, method=2)
+      results[j + 1] = results[j + 1][:, 0:5]
+
+    scores = np.hstack([
+      results[j][:, -1] 
+      for j in range(1, self.num_classes + 1)
+    ])
+    if len(scores) > self.max_per_image:
+      kth = len(scores) - self.max_per_image
+      thresh = np.partition(scores, kth)[kth]
+      for j in range(1, self.num_classes + 1):
+        keep_inds = (results[j][:, -1] >= thresh)
+        results[j] = results[j][keep_inds]
+    return results
+
+
+  def show_results(self, debugger, image, results):
+    debugger.add_img(image, img_id='exdet')
+    for j in range(1, self.num_classes + 1):
+      for bbox in results[j]:
+        if bbox[4] > self.opt.vis_thresh:
+          debugger.add_coco_bbox(bbox[:4], j - 1, bbox[4], img_id='exdet')
+    debugger.show_all_imgs(pause=self.pause)
diff --git a/src/lib/detectors/multi_pose.py b/src/lib/detectors/multi_pose.py
new file mode 100755
index 0000000..e382691
--- /dev/null
+++ b/src/lib/detectors/multi_pose.py
@@ -0,0 +1,109 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import numpy as np
+from progress.bar import Bar
+import time
+import torch
+
+try:
+    from external.nms import soft_nms_39
+except:
+    print('NMS not imported! If you need it,'
+          ' do \n cd $CenterNet_ROOT/src/lib/external \n make')
+from models.decode import multi_pose_decode
+from models.utils import flip_tensor, flip_lr_off, flip_lr
+from utils.image import get_affine_transform
+from utils.post_process import multi_pose_post_process
+from utils.debugger import Debugger
+
+from .base_detector import BaseDetector
+
+
+class MultiPoseDetector(BaseDetector):
+    def __init__(self, opt):
+        super(MultiPoseDetector, self).__init__(opt)
+        self.flip_idx = opt.flip_idx
+
+    def process(self, images, return_time=False):
+        with torch.no_grad():
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            output = self.model(images)[-1]
+            output['hm'] = output['hm'].sigmoid_()
+            if self.opt.hm_hp and not self.opt.mse_loss:
+                output['hm_hp'] = output['hm_hp'].sigmoid_()
+
+            reg = output['reg'] if self.opt.reg_offset else None
+            hm_hp = output['hm_hp'] if self.opt.hm_hp else None
+            hp_offset = output['hp_offset'] if self.opt.reg_hp_offset else None
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+            forward_time = time.time()
+
+            if self.opt.flip_test:
+                output['hm'] = (output['hm'][0:1] +
+                                flip_tensor(output['hm'][1:2])) / 2
+                output['wh'] = (output['wh'][0:1] +
+                                flip_tensor(output['wh'][1:2])) / 2
+                output['hps'] = (output['hps'][0:1] +
+                                 flip_lr_off(output['hps'][1:2], self.flip_idx)) / 2
+                hm_hp = (hm_hp[0:1] + flip_lr(hm_hp[1:2], self.flip_idx)) / 2 \
+                    if hm_hp is not None else None
+                reg = reg[0:1] if reg is not None else None
+                hp_offset = hp_offset[0:1] if hp_offset is not None else None
+
+            dets = multi_pose_decode(
+                output['hm'], output['wh'], output['hps'],
+                reg=reg, hm_hp=hm_hp, hp_offset=hp_offset, K=self.opt.K)
+
+        if return_time:
+            return output, dets, forward_time
+        else:
+            return output, dets
+
+    def post_process(self, dets, meta, scale=1):
+        dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+        dets = multi_pose_post_process(
+            dets.copy(), [meta['c']], [meta['s']],
+            meta['out_height'], meta['out_width'])
+        for j in range(1, self.num_classes + 1):
+            dets[0][j] = np.array(dets[0][j], dtype=np.float32).reshape(-1, 39)
+            # import pdb; pdb.set_trace()
+            dets[0][j][:, :4] /= scale
+            dets[0][j][:, 5:] /= scale
+        return dets[0]
+
+    def merge_outputs(self, detections):
+        results = {}
+        results[1] = np.concatenate(
+            [detection[1] for detection in detections], axis=0).astype(np.float32)
+        if self.opt.nms or len(self.opt.test_scales) > 1:
+            soft_nms_39(results[1], Nt=0.5, method=2)
+        results[1] = results[1].tolist()
+        return results
+
+    def debug(self, debugger, images, dets, output, scale=1):
+        dets = dets.detach().cpu().numpy().copy()
+        dets[:, :, :4] *= self.opt.down_ratio
+        dets[:, :, 5:39] *= self.opt.down_ratio
+        img = images[0].detach().cpu().numpy().transpose(1, 2, 0)
+        img = np.clip(((
+            img * self.std + self.mean) * 255.), 0, 255).astype(np.uint8)
+        pred = debugger.gen_colormap(output['hm'][0].detach().cpu().numpy())
+        debugger.add_blend_img(img, pred, 'pred_hm')
+        if self.opt.hm_hp:
+            pred = debugger.gen_colormap_hp(
+                output['hm_hp'][0].detach().cpu().numpy())
+            debugger.add_blend_img(img, pred, 'pred_hmhp')
+
+    def show_results(self, debugger, image, results):
+        debugger.add_img(image, img_id='multi_pose')
+        for bbox in results[1]:
+            if bbox[4] > self.opt.vis_thresh:
+                debugger.add_coco_bbox(
+                    bbox[:4], 0, bbox[4], img_id='multi_pose')
+                debugger.add_coco_hp(bbox[5:39], img_id='multi_pose')
+        debugger.show_all_imgs(pause=self.pause)
diff --git a/src/lib/external/.gitignore b/src/lib/external/.gitignore
new file mode 100644
index 0000000..f7c8c1a
--- /dev/null
+++ b/src/lib/external/.gitignore
@@ -0,0 +1,7 @@
+bbox.c
+bbox.cpython-35m-x86_64-linux-gnu.so
+bbox.cpython-36m-x86_64-linux-gnu.so
+
+nms.c
+nms.cpython-35m-x86_64-linux-gnu.so
+nms.cpython-36m-x86_64-linux-gnu.so
diff --git a/src/lib/external/Makefile b/src/lib/external/Makefile
new file mode 100644
index 0000000..a482398
--- /dev/null
+++ b/src/lib/external/Makefile
@@ -0,0 +1,3 @@
+all:
+	python setup.py build_ext --inplace
+	rm -rf build
diff --git a/src/lib/external/__init__.py b/src/lib/external/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/lib/external/nms.pyx b/src/lib/external/nms.pyx
new file mode 100644
index 0000000..6499102
--- /dev/null
+++ b/src/lib/external/nms.pyx
@@ -0,0 +1,391 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+# ----------------------------------------------------------
+# Soft-NMS: Improving Object Detection With One Line of Code
+# Copyright (c) University of Maryland, College Park
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Navaneeth Bodla and Bharat Singh
+# ----------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
+    return a if a >= b else b  # float32 helper; takes the place of builtin max in this module
+
+cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
+    return a if a <= b else b  # float32 helper; takes the place of builtin min in this module
+
+def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+    """Greedy NMS on rows (x1, y1, x2, y2, score); returns kept row indices."""
+    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+
+    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    # argsort yields intp indices; np.intp also replaces the removed np.int alias
+    cdef np.ndarray[np.intp_t, ndim=1] order = scores.argsort()[::-1]
+
+    cdef int ndets = dets.shape[0]
+    cdef np.ndarray[np.intp_t, ndim=1] suppressed = \
+            np.zeros((ndets), dtype=np.intp)
+
+    # _i/_j are positions in `order`; i/j are the row indices stored there
+    cdef int _i, _j, i, j
+    # temp variables for box i's (the box currently under consideration)
+    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
+    # variables for computing overlap with box j (lower scoring box)
+    cdef np.float32_t xx1, yy1, xx2, yy2
+    cdef np.float32_t w, h
+    cdef np.float32_t inter, ovr
+
+    keep = []
+    for _i in range(ndets):
+        i = order[_i]
+        if suppressed[i] == 1:
+            continue
+        keep.append(i)
+        ix1 = x1[i]
+        iy1 = y1[i]
+        ix2 = x2[i]
+        iy2 = y2[i]
+        iarea = areas[i]
+        for _j in range(_i + 1, ndets):
+            j = order[_j]
+            if suppressed[j] == 1:
+                continue
+            xx1 = max(ix1, x1[j])
+            yy1 = max(iy1, y1[j])
+            xx2 = min(ix2, x2[j])
+            yy2 = min(iy2, y2[j])
+            w = max(0.0, xx2 - xx1 + 1)
+            h = max(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (iarea + areas[j] - inter)
+            if ovr >= thresh:  # overlap with a kept, higher-scoring box: drop j
+                suppressed[j] = 1
+
+    return keep
+
+def soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0):
+    cdef unsigned int N = boxes.shape[0]  # count of live rows; decremented when a box is discarded
+    cdef float iw, ih, box_area  # box_area is declared but never used
+    cdef float ua
+    cdef int pos = 0
+    cdef float maxscore = 0
+    cdef int maxpos = 0
+    cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov
+
+    for i in range(N):  # each pass moves the best remaining box into row i
+        maxscore = boxes[i, 4]  # column 4 holds the score
+        maxpos = i
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+
+        pos = i + 1
+        # find the highest-scoring box among rows i..N-1
+        while pos < N:
+            if maxscore < boxes[pos, 4]:
+                maxscore = boxes[pos, 4]
+                maxpos = pos
+            pos = pos + 1
+
+        # copy that box into row i (columns 0-4 only; any extra columns stay put)
+        boxes[i,0] = boxes[maxpos,0]
+        boxes[i,1] = boxes[maxpos,1]
+        boxes[i,2] = boxes[maxpos,2]
+        boxes[i,3] = boxes[maxpos,3]
+        boxes[i,4] = boxes[maxpos,4]
+
+        # complete the swap: the old row i goes where the max box was
+        boxes[maxpos,0] = tx1
+        boxes[maxpos,1] = ty1
+        boxes[maxpos,2] = tx2
+        boxes[maxpos,3] = ty2
+        boxes[maxpos,4] = ts
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+
+        pos = i + 1
+        # NMS iterations, note that N changes if detection boxes fall below threshold
+        while pos < N:
+            x1 = boxes[pos, 0]
+            y1 = boxes[pos, 1]
+            x2 = boxes[pos, 2]
+            y2 = boxes[pos, 3]
+            s = boxes[pos, 4]  # assigned but never read
+
+            area = (x2 - x1 + 1) * (y2 - y1 + 1)
+            iw = (min(tx2, x2) - max(tx1, x1) + 1)
+            if iw > 0:
+                ih = (min(ty2, y2) - max(ty1, y1) + 1)
+                if ih > 0:
+                    ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
+                    ov = iw * ih / ua #iou between max box and detection box
+
+                    if method == 1: # linear
+                        if ov > Nt: 
+                            weight = 1 - ov
+                        else:
+                            weight = 1
+                    elif method == 2: # gaussian
+                        weight = np.exp(-(ov * ov)/sigma)
+                    else: # original NMS
+                        if ov > Nt: 
+                            weight = 0
+                        else:
+                            weight = 1
+
+                    boxes[pos, 4] = weight*boxes[pos, 4]  # decay the score in place
+                                
+                    # if box score falls below threshold, discard the box by overwriting it
+                    # with the last live box (row N-1 itself is left unchanged) and update N
+                    if boxes[pos, 4] < threshold:
+                        boxes[pos,0] = boxes[N-1, 0]
+                        boxes[pos,1] = boxes[N-1, 1]
+                        boxes[pos,2] = boxes[N-1, 2]
+                        boxes[pos,3] = boxes[N-1, 3]
+                        boxes[pos,4] = boxes[N-1, 4]
+                        N = N - 1
+                        pos = pos - 1  # re-examine the box just moved into `pos`
+
+            pos = pos + 1
+
+    keep = [i for i in range(N)]  # surviving boxes occupy rows 0..N-1 of the modified `boxes`
+    return keep
+
+def soft_nms_39(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0):
+    cdef unsigned int N = boxes.shape[0]  # count of live rows; decremented when a box is discarded
+    cdef float iw, ih, box_area  # box_area is declared but never used
+    cdef float ua
+    cdef int pos = 0
+    cdef float maxscore = 0
+    cdef int maxpos = 0
+    cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov
+    cdef float tmp  # scratch value for swapping columns 5..38
+
+    for i in range(N):  # each pass moves the best remaining box into row i
+        maxscore = boxes[i, 4]  # column 4 holds the score
+        maxpos = i
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+
+        pos = i + 1
+        # find the highest-scoring box among rows i..N-1
+        while pos < N:
+            if maxscore < boxes[pos, 4]:
+                maxscore = boxes[pos, 4]
+                maxpos = pos
+            pos = pos + 1
+
+        # copy columns 0-4 of that box into row i
+        boxes[i,0] = boxes[maxpos,0]
+        boxes[i,1] = boxes[maxpos,1]
+        boxes[i,2] = boxes[maxpos,2]
+        boxes[i,3] = boxes[maxpos,3]
+        boxes[i,4] = boxes[maxpos,4]
+
+        # complete the swap: the old row i goes where the max box was
+        boxes[maxpos,0] = tx1
+        boxes[maxpos,1] = ty1
+        boxes[maxpos,2] = tx2
+        boxes[maxpos,3] = ty2
+        boxes[maxpos,4] = ts
+
+        for j in range(5, 39):  # swap the remaining 34 columns as well
+            tmp = boxes[i, j]
+            boxes[i, j] = boxes[maxpos, j]
+            boxes[maxpos, j] = tmp
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+
+        pos = i + 1
+        # NMS iterations, note that N changes if detection boxes fall below threshold
+        while pos < N:
+            x1 = boxes[pos, 0]
+            y1 = boxes[pos, 1]
+            x2 = boxes[pos, 2]
+            y2 = boxes[pos, 3]
+            s = boxes[pos, 4]  # assigned but never read
+
+            area = (x2 - x1 + 1) * (y2 - y1 + 1)
+            iw = (min(tx2, x2) - max(tx1, x1) + 1)
+            if iw > 0:
+                ih = (min(ty2, y2) - max(ty1, y1) + 1)
+                if ih > 0:
+                    ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
+                    ov = iw * ih / ua #iou between max box and detection box
+
+                    if method == 1: # linear
+                        if ov > Nt: 
+                            weight = 1 - ov
+                        else:
+                            weight = 1
+                    elif method == 2: # gaussian
+                        weight = np.exp(-(ov * ov)/sigma)
+                    else: # original NMS
+                        if ov > Nt: 
+                            weight = 0
+                        else:
+                            weight = 1
+
+                    boxes[pos, 4] = weight*boxes[pos, 4]  # decay the score in place
+                                
+                    # if box score falls below threshold, discard the box: columns 0-4 are
+                    # overwritten from the last live row, columns 5-38 are swapped; update N
+                    if boxes[pos, 4] < threshold:
+                        boxes[pos,0] = boxes[N-1, 0]
+                        boxes[pos,1] = boxes[N-1, 1]
+                        boxes[pos,2] = boxes[N-1, 2]
+                        boxes[pos,3] = boxes[N-1, 3]
+                        boxes[pos,4] = boxes[N-1, 4]
+                        for j in range(5, 39):
+                            tmp = boxes[pos, j]
+                            boxes[pos, j] = boxes[N - 1, j]
+                            boxes[N - 1, j] = tmp
+                        N = N - 1
+                        pos = pos - 1  # re-examine the box just moved into `pos`
+
+            pos = pos + 1
+
+    keep = [i for i in range(N)]  # surviving boxes occupy rows 0..N-1 of the modified `boxes`
+    return keep
+
+def soft_nms_merge(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0, float weight_exp=6):
+    cdef unsigned int N = boxes.shape[0]  # count of live rows; decremented when a box is discarded
+    cdef float iw, ih, box_area  # box_area is declared but never used
+    cdef float ua
+    cdef int pos = 0
+    cdef float maxscore = 0
+    cdef int maxpos = 0
+    cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov
+    cdef float mx1,mx2,my1,my2,mts,mbs,mw  # weighted coordinate sums (m*1/m*2), weight totals (mts/mbs), merge weight
+
+    for i in range(N):  # each pass moves the best remaining box into row i
+        maxscore = boxes[i, 4]  # column 4 holds the score
+        maxpos = i
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+
+        pos = i + 1
+        # find the highest-scoring box among rows i..N-1
+        while pos < N:
+            if maxscore < boxes[pos, 4]:
+                maxscore = boxes[pos, 4]
+                maxpos = pos
+            pos = pos + 1
+
+        # copy columns 0-4 of that box into row i
+        boxes[i,0] = boxes[maxpos,0]
+        boxes[i,1] = boxes[maxpos,1]
+        boxes[i,2] = boxes[maxpos,2]
+        boxes[i,3] = boxes[maxpos,3]
+        boxes[i,4] = boxes[maxpos,4]
+
+        # NOTE(review): columns 5/6 are never swapped, so these weights are the ones stored in row i before the swap -- confirm intended
+        mx1 = boxes[i, 0] * boxes[i, 5]
+        my1 = boxes[i, 1] * boxes[i, 5]
+        mx2 = boxes[i, 2] * boxes[i, 6]
+        my2 = boxes[i, 3] * boxes[i, 6]
+        mts = boxes[i, 5]
+        mbs = boxes[i, 6]
+
+        # complete the swap: the old row i goes where the max box was
+        boxes[maxpos,0] = tx1
+        boxes[maxpos,1] = ty1
+        boxes[maxpos,2] = tx2
+        boxes[maxpos,3] = ty2
+        boxes[maxpos,4] = ts
+
+        tx1 = boxes[i,0]
+        ty1 = boxes[i,1]
+        tx2 = boxes[i,2]
+        ty2 = boxes[i,3]
+        ts = boxes[i,4]
+
+        pos = i + 1
+        # NMS iterations, note that N changes if detection boxes fall below threshold
+        while pos < N:
+            x1 = boxes[pos, 0]
+            y1 = boxes[pos, 1]
+            x2 = boxes[pos, 2]
+            y2 = boxes[pos, 3]
+            s = boxes[pos, 4]  # assigned but never read
+
+            area = (x2 - x1 + 1) * (y2 - y1 + 1)
+            iw = (min(tx2, x2) - max(tx1, x1) + 1)
+            if iw > 0:
+                ih = (min(ty2, y2) - max(ty1, y1) + 1)
+                if ih > 0:
+                    ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
+                    ov = iw * ih / ua #iou between max box and detection box
+
+                    if method == 1: # linear
+                        if ov > Nt: 
+                            weight = 1 - ov
+                        else:
+                            weight = 1
+                    elif method == 2: # gaussian
+                        weight = np.exp(-(ov * ov)/sigma)
+                    else: # original NMS
+                        if ov > Nt: 
+                            weight = 0
+                        else:
+                            weight = 1
+
+                    mw  = (1 - weight) ** weight_exp  # boxes that were suppressed harder contribute more to the merge
+                    mx1 = mx1 + boxes[pos, 0] * boxes[pos, 5] * mw
+                    my1 = my1 + boxes[pos, 1] * boxes[pos, 5] * mw
+                    mx2 = mx2 + boxes[pos, 2] * boxes[pos, 6] * mw
+                    my2 = my2 + boxes[pos, 3] * boxes[pos, 6] * mw
+                    mts = mts + boxes[pos, 5] * mw
+                    mbs = mbs + boxes[pos, 6] * mw
+
+                    boxes[pos, 4] = weight*boxes[pos, 4]  # decay the score in place
+                                
+                    # if box score falls below threshold, discard the box by overwriting columns 0-4
+                    # with the last live box (columns 5/6 are not moved) and update N
+                    if boxes[pos, 4] < threshold:
+                        boxes[pos,0] = boxes[N-1, 0]
+                        boxes[pos,1] = boxes[N-1, 1]
+                        boxes[pos,2] = boxes[N-1, 2]
+                        boxes[pos,3] = boxes[N-1, 3]
+                        boxes[pos,4] = boxes[N-1, 4]
+                        N = N - 1
+                        pos = pos - 1  # re-examine the box just moved into `pos`
+
+            pos = pos + 1
+
+        boxes[i, 0] = mx1 / mts  # replace the corners by the weighted averages; NOTE(review): no guard against mts == 0
+        boxes[i, 1] = my1 / mts
+        boxes[i, 2] = mx2 / mbs  # NOTE(review): no guard against mbs == 0
+        boxes[i, 3] = my2 / mbs
+
+    keep = [i for i in range(N)]  # surviving boxes occupy rows 0..N-1 of the modified `boxes`
+    return keep
diff --git a/src/lib/external/setup.py b/src/lib/external/setup.py
new file mode 100644
index 0000000..c4d2571
--- /dev/null
+++ b/src/lib/external/setup.py
@@ -0,0 +1,18 @@
+import numpy
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Build import cythonize
+
+extensions = [  # one Cython extension module, "nms", built from nms.pyx
+    Extension(
+        "nms", 
+        ["nms.pyx"],
+        extra_compile_args=["-Wno-cpp", "-Wno-unused-function"]  # silence these two compiler warning classes
+    )
+]
+
+setup(
+    name="coco",
+    ext_modules=cythonize(extensions),
+    include_dirs=[numpy.get_include()]  # NumPy headers for the `cimport numpy` in nms.pyx
+)
diff --git a/src/lib/logger.py b/src/lib/logger.py
new file mode 100644
index 0000000..eac9a14
--- /dev/null
+++ b/src/lib/logger.py
@@ -0,0 +1,72 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514
+import os
+import time
+import sys
+import torch
+USE_TENSORBOARD = True
+try:
+  import tensorboardX
+  print('Using tensorboardX')
+except:
+  USE_TENSORBOARD = False
+
+class Logger(object):
+  def __init__(self, opt):
+    """Create a summary writer logging to log_dir."""
+    if not os.path.exists(opt.save_dir):
+      os.makedirs(opt.save_dir)
+    if not os.path.exists(opt.debug_dir):
+      os.makedirs(opt.debug_dir)
+   
+    time_str = time.strftime('%Y-%m-%d-%H-%M')
+
+    args = dict((name, getattr(opt, name)) for name in dir(opt)
+                if not name.startswith('_'))
+    file_name = os.path.join(opt.save_dir, 'opt.txt')
+    with open(file_name, 'wt') as opt_file:
+      opt_file.write('==> torch version: {}\n'.format(torch.__version__))
+      opt_file.write('==> cudnn version: {}\n'.format(
+        torch.backends.cudnn.version()))
+      opt_file.write('==> Cmd:\n')
+      opt_file.write(str(sys.argv))
+      opt_file.write('\n==> Opt:\n')
+      for k, v in sorted(args.items()):
+        opt_file.write('  %s: %s\n' % (str(k), str(v)))
+          
+    log_dir = opt.save_dir + '/logs_{}'.format(time_str)
+    if USE_TENSORBOARD:
+      self.writer = tensorboardX.SummaryWriter(log_dir=log_dir)
+    else:
+      if not os.path.exists(os.path.dirname(log_dir)):
+        os.mkdir(os.path.dirname(log_dir))
+      if not os.path.exists(log_dir):
+        os.mkdir(log_dir)
+    self.log = open(log_dir + '/log.txt', 'w')
+    try:
+      os.system('cp {}/opt.txt {}/'.format(opt.save_dir, log_dir))
+    except:
+      pass
+    self.start_line = True
+
+  def write(self, txt):
+    if self.start_line:
+      time_str = time.strftime('%Y-%m-%d-%H-%M')
+      self.log.write('{}: {}'.format(time_str, txt))
+    else:
+      self.log.write(txt)  
+    self.start_line = False
+    if '\n' in txt:
+      self.start_line = True
+      self.log.flush()
+  
+  def close(self):
+    self.log.close()
+  
+  def scalar_summary(self, tag, value, step):
+    """Log a scalar variable."""
+    if USE_TENSORBOARD:
+      self.writer.add_scalar(tag, value, step)
diff --git a/src/lib/models/data_parallel.py b/src/lib/models/data_parallel.py
new file mode 100644
index 0000000..1a96c0d
--- /dev/null
+++ b/src/lib/models/data_parallel.py
@@ -0,0 +1,128 @@
+import torch
+from torch.nn.modules import Module
+from torch.nn.parallel.scatter_gather import gather
+from torch.nn.parallel.replicate import replicate
+from torch.nn.parallel.parallel_apply import parallel_apply
+
+
+from .scatter_gather import scatter_kwargs
+
+class _DataParallel(Module):
+    r"""Implements data parallelism at the module level.
+
+    This container parallelizes the application of the given module by
+    splitting the input across the specified devices by chunking in the batch
+    dimension. In the forward pass, the module is replicated on each device,
+    and each replica handles a portion of the input. During the backwards
+    pass, gradients from each replica are summed into the original module.
+
+    Unlike torch.nn.DataParallel, ``chunk_sizes`` is forwarded to this
+    package's ``scatter_kwargs``, presumably so that each device can receive
+    a different number of samples (confirm in scatter_gather.py).
+
+    See also: :ref:`cuda-nn-dataparallel-instead`
+
+    Arbitrary positional and keyword inputs are allowed to be passed into
+    DataParallel EXCEPT Tensors. All variables will be scattered on dim
+    specified (default 0). Primitive types will be broadcasted, but all
+    other types will be a shallow copy and can be corrupted if written to in
+    the model's forward pass.
+
+    Args:
+        module: module to be parallelized
+        device_ids: CUDA devices (default: all devices)
+        output_device: device location of output (default: device_ids[0])
+        dim: dimension along which inputs are scattered and outputs gathered
+        chunk_sizes: per-device chunk sizes handed to ``scatter_kwargs``
+
+    Example::
+
+        >>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2])
+        >>> output = net(input_var)
+    """
+
+    def __init__(self, module, device_ids=None, output_device=None, dim=0, chunk_sizes=None):
+        super(_DataParallel, self).__init__()
+
+        if not torch.cuda.is_available():  # CPU fallback: forward() will call the module directly
+            self.module = module
+            self.device_ids = []
+            return
+
+        if device_ids is None:
+            device_ids = list(range(torch.cuda.device_count()))
+        if output_device is None:
+            output_device = device_ids[0]
+        self.dim = dim
+        self.module = module
+        self.device_ids = device_ids
+        self.chunk_sizes = chunk_sizes
+        self.output_device = output_device
+        if len(self.device_ids) == 1:  # single GPU: move the module there once, no replication later
+            self.module.cuda(device_ids[0])
+
+    def forward(self, *inputs, **kwargs):
+        if not self.device_ids:  # empty list is set by __init__ when CUDA is unavailable
+            return self.module(*inputs, **kwargs)
+        inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids, self.chunk_sizes)
+        if len(self.device_ids) == 1:  # one device: run the scattered chunk without replicate/gather
+            return self.module(*inputs[0], **kwargs[0])
+        replicas = self.replicate(self.module, self.device_ids[:len(inputs)])  # only as many replicas as chunks
+        outputs = self.parallel_apply(replicas, inputs, kwargs)
+        return self.gather(outputs, self.output_device)
+
+    def replicate(self, module, device_ids):
+        return replicate(module, device_ids)
+
+    def scatter(self, inputs, kwargs, device_ids, chunk_sizes):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim, chunk_sizes=self.chunk_sizes)  # NOTE: the chunk_sizes argument is ignored; self.chunk_sizes is used
+
+    def parallel_apply(self, replicas, inputs, kwargs):
+        return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
+
+    def gather(self, outputs, output_device):
+        return gather(outputs, output_device, dim=self.dim)
+
+
+def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None):
+    r"""Evaluates module(input) in parallel across the GPUs given in device_ids.
+
+    This is the functional version of the DataParallel module.
+
+    Args:
+        module: the module to evaluate in parallel
+        inputs: inputs to the module
+        device_ids: GPU ids on which to replicate module
+        output_device: GPU location of the output  Use -1 to indicate the CPU.
+            (default: device_ids[0])
+    Returns:
+        a Variable containing the result of module(input) located on
+        output_device
+    """
+    if not isinstance(inputs, tuple):
+        inputs = (inputs,)
+
+    if device_ids is None:
+        device_ids = list(range(torch.cuda.device_count()))
+
+    if output_device is None:
+        output_device = device_ids[0]
+
+    inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim)
+    if len(device_ids) == 1:
+        return module(*inputs[0], **module_kwargs[0])
+    used_device_ids = device_ids[:len(inputs)]
+    replicas = replicate(module, used_device_ids)
+    outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids)
+    return gather(outputs, output_device, dim)
+
+def DataParallel(module, device_ids=None, output_device=None, dim=0, chunk_sizes=None):
+    if chunk_sizes is None:
+        return torch.nn.DataParallel(module, device_ids, output_device, dim)
+    standard_size = True
+    for i in range(1, len(chunk_sizes)):
+        if chunk_sizes[i] != chunk_sizes[0]:
+            standard_size = False
+    if standard_size:
+        return torch.nn.DataParallel(module, device_ids, output_device, dim)
+    return _DataParallel(module, device_ids, output_device, dim, chunk_sizes)
\ No newline at end of file
diff --git a/src/lib/models/decode.py b/src/lib/models/decode.py
new file mode 100644
index 0000000..54d3209
--- /dev/null
+++ b/src/lib/models/decode.py
@@ -0,0 +1,652 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import torch.nn as nn
+from .utils import _gather_feat, _transpose_and_gather_feat
+
+
+def _nms(heat, kernel=3):
+    """Zero every entry that is not the maximum of its kernel x kernel window."""
+    half = (kernel - 1) // 2
+    # stride-1 pooling with this padding keeps the map size for odd kernels
+    pooled = nn.functional.max_pool2d(
+        heat, (kernel, kernel), stride=1, padding=half)
+    return heat * (pooled == heat).float()
+
+
+def _left_aggregate(heat):
+    '''
+        Sweep each row left to right; a pixel adds the running sum of its left
+        neighbour when its own value is not smaller. heat: batch x c x h x w.
+    '''
+    orig_shape = heat.shape
+    cols = heat.reshape(-1, orig_shape[3]).transpose(1, 0).contiguous()  # w x rows
+    acc = cols.clone()
+    for x in range(1, cols.shape[0]):
+        rising = (cols[x] >= cols[x - 1]).float()
+        acc[x] += acc[x - 1] * rising
+    return (acc - cols).transpose(1, 0).reshape(orig_shape)
+
+
+def _right_aggregate(heat):
+    '''
+        Sweep each row right to left; a pixel adds the running sum of its right
+        neighbour when its own value is not smaller. heat: batch x c x h x w.
+    '''
+    orig_shape = heat.shape
+    cols = heat.reshape(-1, orig_shape[3]).transpose(1, 0).contiguous()  # w x rows
+    acc = cols.clone()
+    for x in range(cols.shape[0] - 2, -1, -1):
+        falling = (cols[x] >= cols[x + 1]).float()
+        acc[x] += acc[x + 1] * falling
+    return (acc - cols).transpose(1, 0).reshape(orig_shape)
+
+
+def _top_aggregate(heat):
+    '''
+        Sweep each column top to bottom; a pixel adds the running sum of the
+        pixel above when its own value is not smaller. heat: batch x c x h x w.
+    '''
+    flipped = heat.transpose(3, 2)  # b x c x w x h, so h becomes the last axis
+    flipped_shape = flipped.shape
+    rows = flipped.reshape(-1, flipped_shape[3]).transpose(1, 0).contiguous()
+    acc = rows.clone()
+    for y in range(1, rows.shape[0]):
+        rising = (rows[y] >= rows[y - 1]).float()
+        acc[y] += acc[y - 1] * rising
+    return (acc - rows).transpose(1, 0).reshape(flipped_shape).transpose(3, 2)
+
+
+def _bottom_aggregate(heat):
+    '''
+        Sweep each column bottom to top; a pixel adds the running sum of the
+        pixel below when its own value is not smaller. heat: batch x c x h x w.
+    '''
+    flipped = heat.transpose(3, 2)  # b x c x w x h, so h becomes the last axis
+    flipped_shape = flipped.shape
+    rows = flipped.reshape(-1, flipped_shape[3]).transpose(1, 0).contiguous()
+    acc = rows.clone()
+    for y in range(rows.shape[0] - 2, -1, -1):
+        falling = (rows[y] >= rows[y + 1]).float()
+        acc[y] += acc[y + 1] * falling
+    return (acc - rows).transpose(1, 0).reshape(flipped_shape).transpose(3, 2)
+
+
+def _h_aggregate(heat, aggr_weight=0.1):  # heat + weighted left and right sweeps
+    return (aggr_weight * _left_aggregate(heat)
+            + aggr_weight * _right_aggregate(heat) + heat)
+
+
+def _v_aggregate(heat, aggr_weight=0.1):  # heat + weighted top and bottom sweeps
+    return (aggr_weight * _top_aggregate(heat)
+            + aggr_weight * _bottom_aggregate(heat) + heat)
+
+
+'''
+# Slow for large number of categories
+def _topk(scores, K=40):
+    batch, cat, height, width = scores.size()
+    topk_scores, topk_inds = torch.topk(scores.view(batch, -1), K)
+
+    topk_clses = (topk_inds / (height * width)).int()
+
+    topk_inds = topk_inds % (height * width)
+    topk_ys   = (topk_inds / width).int().float()
+    topk_xs   = (topk_inds % width).int().float()
+    return topk_scores, topk_inds, topk_clses, topk_ys, topk_xs
+'''
+
+
+def _topk_channel(scores, K=40):
+    """Per-channel top-K: returns (scores, flat indices, ys, xs), each b x c x K."""
+    batch, cat, height, width = scores.size()
+    flat = scores.view(batch, cat, -1)  # flatten every h x w map
+    best_scores, best_inds = torch.topk(flat, K)
+
+    best_inds = best_inds % (height * width)
+    rows = (best_inds / width).int().float()
+    cols = (best_inds % width).int().float()
+    return best_scores, best_inds, rows, cols
+
+
+def _topk(scores, K=40):
+    """Top-K peaks over all channels.
+
+    First keeps the K best positions of every channel, then the K best of
+    those candidates; returns (scores, flat indices, classes, ys, xs), b x K each.
+    """
+    batch, cat, height, width = scores.size()
+    chan_scores, chan_inds = torch.topk(scores.view(batch, cat, -1), K)
+    chan_inds = chan_inds % (height * width)
+    chan_ys = (chan_inds / width).int().float()
+    chan_xs = (chan_inds % width).int().float()
+
+    best_score, best = torch.topk(chan_scores.view(batch, -1), K)
+    best_clses = (best / K).int()  # candidate j comes from channel j // K
+    def pick(t):  # select the winning candidates out of a b x c x K tensor
+        return _gather_feat(t.view(batch, -1, 1), best).view(batch, K)
+    return best_score, pick(chan_inds), best_clses, pick(chan_ys), pick(chan_xs)
+
+
+def agnex_ct_decode(
+    t_heat, l_heat, b_heat, r_heat, ct_heat,
+    t_regr=None, l_regr=None, b_regr=None, r_regr=None,
+    K=40, scores_thresh=0.1, center_thresh=0.1, aggr_weight=0.0, num_dets=1000
+):
+    batch, cat, height, width = t_heat.size()
+    # Decode boxes from top/left/bottom/right heatmaps and a class-agnostic center map.
+    '''
+    t_heat  = torch.sigmoid(t_heat)
+    l_heat  = torch.sigmoid(l_heat)
+    b_heat  = torch.sigmoid(b_heat)
+    r_heat  = torch.sigmoid(r_heat)
+    ct_heat = torch.sigmoid(ct_heat)
+    '''
+    if aggr_weight > 0:
+        t_heat = _h_aggregate(t_heat, aggr_weight=aggr_weight)
+        l_heat = _v_aggregate(l_heat, aggr_weight=aggr_weight)
+        b_heat = _h_aggregate(b_heat, aggr_weight=aggr_weight)
+        r_heat = _v_aggregate(r_heat, aggr_weight=aggr_weight)
+
+    # perform nms on heatmaps
+    t_heat = _nms(t_heat)
+    l_heat = _nms(l_heat)
+    b_heat = _nms(b_heat)
+    r_heat = _nms(r_heat)
+    # clamp the point scores to at most 1
+    t_heat[t_heat > 1] = 1
+    l_heat[l_heat > 1] = 1
+    b_heat[b_heat > 1] = 1
+    r_heat[r_heat > 1] = 1
+    # top-K peaks of every point heatmap; the class output of _topk is ignored here
+    t_scores, t_inds, _, t_ys, t_xs = _topk(t_heat, K=K)
+    l_scores, l_inds, _, l_ys, l_xs = _topk(l_heat, K=K)
+    b_scores, b_inds, _, b_ys, b_xs = _topk(b_heat, K=K)
+    r_scores, r_inds, _, r_ys, r_xs = _topk(r_heat, K=K)
+    # class-agnostic center: per-pixel max over the channels and its argmax class
+    ct_heat_agn, ct_clses = torch.max(ct_heat, dim=1, keepdim=True)
+
+    # enumerate all K^4 (top, left, bottom, right) combinations by broadcasting
+
+    t_ys = t_ys.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
+    t_xs = t_xs.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
+    l_ys = l_ys.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
+    l_xs = l_xs.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
+    b_ys = b_ys.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
+    b_xs = b_xs.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
+    r_ys = r_ys.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
+    r_xs = r_xs.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
+    # integer center of every candidate box, used to look up its center score
+    box_ct_xs = ((l_xs + r_xs + 0.5) / 2).long()
+    box_ct_ys = ((t_ys + b_ys + 0.5) / 2).long()
+
+    ct_inds = box_ct_ys * width + box_ct_xs
+    ct_inds = ct_inds.view(batch, -1)
+    ct_heat_agn = ct_heat_agn.view(batch, -1, 1)
+    ct_clses = ct_clses.view(batch, -1, 1)
+    ct_scores = _gather_feat(ct_heat_agn, ct_inds)
+    clses = _gather_feat(ct_clses, ct_inds)
+    # combined score: the four point scores plus the center score counted twice, over 6
+    t_scores = t_scores.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
+    l_scores = l_scores.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
+    b_scores = b_scores.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
+    r_scores = r_scores.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
+    ct_scores = ct_scores.view(batch, K, K, K, K)
+    scores = (t_scores + l_scores + b_scores + r_scores + 2 * ct_scores) / 6
+
+    # reject geometrically inconsistent combinations (e.g. top point below another point)
+    top_inds = (t_ys > l_ys) + (t_ys > b_ys) + (t_ys > r_ys)
+    top_inds = (top_inds > 0)
+    left_inds = (l_xs > t_xs) + (l_xs > b_xs) + (l_xs > r_xs)
+    left_inds = (left_inds > 0)
+    bottom_inds = (b_ys < t_ys) + (b_ys < l_ys) + (b_ys < r_ys)
+    bottom_inds = (bottom_inds > 0)
+    right_inds = (r_xs < t_xs) + (r_xs < l_xs) + (r_xs < b_xs)
+    right_inds = (right_inds > 0)
+    # reject combinations in which any point or center score is under its threshold
+    sc_inds = (t_scores < scores_thresh) + (l_scores < scores_thresh) + \
+              (b_scores < scores_thresh) + (r_scores < scores_thresh) + \
+              (ct_scores < center_thresh)
+    sc_inds = (sc_inds > 0)
+    # every failed test subtracts 1 from the score of the combination
+    scores = scores - sc_inds.float()
+    scores = scores - top_inds.float()
+    scores = scores - left_inds.float()
+    scores = scores - bottom_inds.float()
+    scores = scores - right_inds.float()
+    # keep the num_dets best-scoring combinations
+    scores = scores.view(batch, -1)
+    scores, inds = torch.topk(scores, num_dets)
+    scores = scores.unsqueeze(2)
+    # refine coordinates with the regression offsets if all four are given, else add 0.5
+    if t_regr is not None and l_regr is not None \
+            and b_regr is not None and r_regr is not None:
+        t_regr = _transpose_and_gather_feat(t_regr, t_inds)
+        t_regr = t_regr.view(batch, K, 1, 1, 1, 2)
+        l_regr = _transpose_and_gather_feat(l_regr, l_inds)
+        l_regr = l_regr.view(batch, 1, K, 1, 1, 2)
+        b_regr = _transpose_and_gather_feat(b_regr, b_inds)
+        b_regr = b_regr.view(batch, 1, 1, K, 1, 2)
+        r_regr = _transpose_and_gather_feat(r_regr, r_inds)
+        r_regr = r_regr.view(batch, 1, 1, 1, K, 2)
+
+        t_xs = t_xs + t_regr[..., 0]
+        t_ys = t_ys + t_regr[..., 1]
+        l_xs = l_xs + l_regr[..., 0]
+        l_ys = l_ys + l_regr[..., 1]
+        b_xs = b_xs + b_regr[..., 0]
+        b_ys = b_ys + b_regr[..., 1]
+        r_xs = r_xs + r_regr[..., 0]
+        r_ys = r_ys + r_regr[..., 1]
+    else:
+        t_xs = t_xs + 0.5
+        t_ys = t_ys + 0.5
+        l_xs = l_xs + 0.5
+        l_ys = l_ys + 0.5
+        b_xs = b_xs + 0.5
+        b_ys = b_ys + 0.5
+        r_xs = r_xs + 0.5
+        r_ys = r_ys + 0.5
+    # boxes as (left x, top y, right x, bottom y), then only the kept combinations
+    bboxes = torch.stack((l_xs, t_ys, r_xs, b_ys), dim=5)
+    bboxes = bboxes.view(batch, -1, 4)
+    bboxes = _gather_feat(bboxes, inds)
+
+    clses = clses.contiguous().view(batch, -1, 1)
+    clses = _gather_feat(clses, inds).float()
+    # coordinates of the four points for the kept combinations
+    t_xs = t_xs.contiguous().view(batch, -1, 1)
+    t_xs = _gather_feat(t_xs, inds).float()
+    t_ys = t_ys.contiguous().view(batch, -1, 1)
+    t_ys = _gather_feat(t_ys, inds).float()
+    l_xs = l_xs.contiguous().view(batch, -1, 1)
+    l_xs = _gather_feat(l_xs, inds).float()
+    l_ys = l_ys.contiguous().view(batch, -1, 1)
+    l_ys = _gather_feat(l_ys, inds).float()
+    b_xs = b_xs.contiguous().view(batch, -1, 1)
+    b_xs = _gather_feat(b_xs, inds).float()
+    b_ys = b_ys.contiguous().view(batch, -1, 1)
+    b_ys = _gather_feat(b_ys, inds).float()
+    r_xs = r_xs.contiguous().view(batch, -1, 1)
+    r_xs = _gather_feat(r_xs, inds).float()
+    r_ys = r_ys.contiguous().view(batch, -1, 1)
+    r_ys = _gather_feat(r_ys, inds).float()
+    detections = torch.cat([bboxes, scores, t_xs, t_ys, l_xs, l_ys,
+                            b_xs, b_ys, r_xs, r_ys, clses], dim=2)
+    # row layout: 4 box coords, score, 8 point coords, class (14 values per detection)
+    return detections
+
+
+def exct_decode(
+    t_heat, l_heat, b_heat, r_heat, ct_heat,
+    t_regr=None, l_regr=None, b_regr=None, r_regr=None,
+    K=40, scores_thresh=0.1, center_thresh=0.1, aggr_weight=0.0, num_dets=1000
+):
+    batch, cat, height, width = t_heat.size()
+    '''
+    t_heat  = torch.sigmoid(t_heat)
+    l_heat  = torch.sigmoid(l_heat)
+    b_heat  = torch.sigmoid(b_heat)
+    r_heat  = torch.sigmoid(r_heat)
+    ct_heat = torch.sigmoid(ct_heat)
+    '''
+    # Decode boxes from per-class top/left/bottom/right heatmaps and a per-class center map.
+    if aggr_weight > 0:
+        t_heat = _h_aggregate(t_heat, aggr_weight=aggr_weight)
+        l_heat = _v_aggregate(l_heat, aggr_weight=aggr_weight)
+        b_heat = _h_aggregate(b_heat, aggr_weight=aggr_weight)
+        r_heat = _v_aggregate(r_heat, aggr_weight=aggr_weight)
+
+    # perform nms on heatmaps
+    t_heat = _nms(t_heat)
+    l_heat = _nms(l_heat)
+    b_heat = _nms(b_heat)
+    r_heat = _nms(r_heat)
+    # clamp the point scores to at most 1
+    t_heat[t_heat > 1] = 1
+    l_heat[l_heat > 1] = 1
+    b_heat[b_heat > 1] = 1
+    r_heat[r_heat > 1] = 1
+    # top-K peaks (with their classes) of every point heatmap
+    t_scores, t_inds, t_clses, t_ys, t_xs = _topk(t_heat, K=K)
+    l_scores, l_inds, l_clses, l_ys, l_xs = _topk(l_heat, K=K)
+    b_scores, b_inds, b_clses, b_ys, b_xs = _topk(b_heat, K=K)
+    r_scores, r_inds, r_clses, r_ys, r_xs = _topk(r_heat, K=K)
+    # enumerate all K^4 (top, left, bottom, right) combinations by broadcasting
+    t_ys = t_ys.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
+    t_xs = t_xs.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
+    l_ys = l_ys.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
+    l_xs = l_xs.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
+    b_ys = b_ys.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
+    b_xs = b_xs.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
+    r_ys = r_ys.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
+    r_xs = r_xs.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
+    # classes broadcast the same way; the center score is read in the top point's class channel
+    t_clses = t_clses.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
+    l_clses = l_clses.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
+    b_clses = b_clses.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
+    r_clses = r_clses.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
+    box_ct_xs = ((l_xs + r_xs + 0.5) / 2).long()
+    box_ct_ys = ((t_ys + b_ys + 0.5) / 2).long()
+    ct_inds = t_clses.long() * (height * width) + box_ct_ys * width + box_ct_xs
+    ct_inds = ct_inds.view(batch, -1)
+    ct_heat = ct_heat.view(batch, -1, 1)
+    ct_scores = _gather_feat(ct_heat, ct_inds)
+    # combined score: the four point scores plus the center score counted twice, over 6
+    t_scores = t_scores.view(batch, K, 1, 1, 1).expand(batch, K, K, K, K)
+    l_scores = l_scores.view(batch, 1, K, 1, 1).expand(batch, K, K, K, K)
+    b_scores = b_scores.view(batch, 1, 1, K, 1).expand(batch, K, K, K, K)
+    r_scores = r_scores.view(batch, 1, 1, 1, K).expand(batch, K, K, K, K)
+    ct_scores = ct_scores.view(batch, K, K, K, K)
+    scores = (t_scores + l_scores + b_scores + r_scores + 2 * ct_scores) / 6
+
+    # reject boxes based on classes
+    cls_inds = (t_clses != l_clses) + (t_clses != b_clses) + \
+               (t_clses != r_clses)
+    cls_inds = (cls_inds > 0)
+    # reject geometrically inconsistent combinations (e.g. top point below another point)
+    top_inds = (t_ys > l_ys) + (t_ys > b_ys) + (t_ys > r_ys)
+    top_inds = (top_inds > 0)
+    left_inds = (l_xs > t_xs) + (l_xs > b_xs) + (l_xs > r_xs)
+    left_inds = (left_inds > 0)
+    bottom_inds = (b_ys < t_ys) + (b_ys < l_ys) + (b_ys < r_ys)
+    bottom_inds = (bottom_inds > 0)
+    right_inds = (r_xs < t_xs) + (r_xs < l_xs) + (r_xs < b_xs)
+    right_inds = (right_inds > 0)
+    # reject combinations in which any point or center score is under its threshold
+    sc_inds = (t_scores < scores_thresh) + (l_scores < scores_thresh) + \
+              (b_scores < scores_thresh) + (r_scores < scores_thresh) + \
+              (ct_scores < center_thresh)
+    sc_inds = (sc_inds > 0)
+    # every failed test subtracts 1 from the score of the combination
+    scores = scores - sc_inds.float()
+    scores = scores - cls_inds.float()
+    scores = scores - top_inds.float()
+    scores = scores - left_inds.float()
+    scores = scores - bottom_inds.float()
+    scores = scores - right_inds.float()
+    # keep the num_dets best-scoring combinations
+    scores = scores.view(batch, -1)
+    scores, inds = torch.topk(scores, num_dets)
+    scores = scores.unsqueeze(2)
+    # refine coordinates with the regression offsets if all four are given, else add 0.5
+    if t_regr is not None and l_regr is not None \
+            and b_regr is not None and r_regr is not None:
+        t_regr = _transpose_and_gather_feat(t_regr, t_inds)
+        t_regr = t_regr.view(batch, K, 1, 1, 1, 2)
+        l_regr = _transpose_and_gather_feat(l_regr, l_inds)
+        l_regr = l_regr.view(batch, 1, K, 1, 1, 2)
+        b_regr = _transpose_and_gather_feat(b_regr, b_inds)
+        b_regr = b_regr.view(batch, 1, 1, K, 1, 2)
+        r_regr = _transpose_and_gather_feat(r_regr, r_inds)
+        r_regr = r_regr.view(batch, 1, 1, 1, K, 2)
+
+        t_xs = t_xs + t_regr[..., 0]
+        t_ys = t_ys + t_regr[..., 1]
+        l_xs = l_xs + l_regr[..., 0]
+        l_ys = l_ys + l_regr[..., 1]
+        b_xs = b_xs + b_regr[..., 0]
+        b_ys = b_ys + b_regr[..., 1]
+        r_xs = r_xs + r_regr[..., 0]
+        r_ys = r_ys + r_regr[..., 1]
+    else:
+        t_xs = t_xs + 0.5
+        t_ys = t_ys + 0.5
+        l_xs = l_xs + 0.5
+        l_ys = l_ys + 0.5
+        b_xs = b_xs + 0.5
+        b_ys = b_ys + 0.5
+        r_xs = r_xs + 0.5
+        r_ys = r_ys + 0.5
+    # boxes as (left x, top y, right x, bottom y), then only the kept combinations
+    bboxes = torch.stack((l_xs, t_ys, r_xs, b_ys), dim=5)
+    bboxes = bboxes.view(batch, -1, 4)
+    bboxes = _gather_feat(bboxes, inds)
+    # the reported class is the class of the top point
+    clses = t_clses.contiguous().view(batch, -1, 1)
+    clses = _gather_feat(clses, inds).float()
+    # coordinates of the four points for the kept combinations
+    t_xs = t_xs.contiguous().view(batch, -1, 1)
+    t_xs = _gather_feat(t_xs, inds).float()
+    t_ys = t_ys.contiguous().view(batch, -1, 1)
+    t_ys = _gather_feat(t_ys, inds).float()
+    l_xs = l_xs.contiguous().view(batch, -1, 1)
+    l_xs = _gather_feat(l_xs, inds).float()
+    l_ys = l_ys.contiguous().view(batch, -1, 1)
+    l_ys = _gather_feat(l_ys, inds).float()
+    b_xs = b_xs.contiguous().view(batch, -1, 1)
+    b_xs = _gather_feat(b_xs, inds).float()
+    b_ys = b_ys.contiguous().view(batch, -1, 1)
+    b_ys = _gather_feat(b_ys, inds).float()
+    r_xs = r_xs.contiguous().view(batch, -1, 1)
+    r_xs = _gather_feat(r_xs, inds).float()
+    r_ys = r_ys.contiguous().view(batch, -1, 1)
+    r_ys = _gather_feat(r_ys, inds).float()
+    detections = torch.cat([bboxes, scores, t_xs, t_ys, l_xs, l_ys,
+                            b_xs, b_ys, r_xs, r_ys, clses], dim=2)
+    # row layout: 4 box coords, score, 8 point coords, class (14 values per detection)
+    return detections
+
+
+def ddd_decode(heat, rot, depth, dim, wh=None, reg=None, K=40):
+    """Decode the K strongest heatmap peaks into 3D detection rows.
+
+    Args:
+        heat: b x classes x h x w heatmap, used as given (no sigmoid here)
+        rot, depth, dim: dense heads read at the peaks as 8, 1 and 3 values
+        wh: optional dense head read as 2 values per peak
+        reg: optional center offset head; without it 0.5 is added to x and y
+        K: number of peaks to keep
+    Returns:
+        b x K x C tensor with rows [x, y, score, rot, depth, dim, (wh,) class]
+    """
+    batch, _, _, _ = heat.size()
+    # non-maximum suppression first, then the top-K peaks over all classes
+    scores, inds, clses, ys, xs = _topk(_nms(heat), K=K)
+
+    def at_peaks(feat, channels):
+        # values of a dense head at the selected peaks, as b x K x channels
+        return _transpose_and_gather_feat(feat, inds).view(batch, K, channels)
+
+    xs = xs.view(batch, K, 1)
+    ys = ys.view(batch, K, 1)
+    # sub-pixel peak position
+    if reg is None:
+        xs, ys = xs + 0.5, ys + 0.5
+    else:
+        offsets = at_peaks(reg, 2)
+        xs, ys = xs + offsets[:, :, 0:1], ys + offsets[:, :, 1:2]
+
+    # assemble the detection columns in output order
+    columns = [xs, ys, scores.view(batch, K, 1),
+               at_peaks(rot, 8), at_peaks(depth, 1), at_peaks(dim, 3)]
+    if wh is not None:
+        columns.append(at_peaks(wh, 2))
+    columns.append(clses.view(batch, K, 1).float())
+
+    return torch.cat(columns, dim=2)
+
+
+def ctdet_decode(heat, wh, reg=None, cat_spec_wh=False, K=100):
+    """Decode the K strongest center-heatmap peaks into 2D boxes.
+
+    Args:
+        heat: b x classes x h x w center heatmap (no sigmoid is applied here)
+        wh: size head; with cat_spec_wh it holds one (w, h) pair per class
+        reg: optional center offset head; without it 0.5 is added to x and y
+        cat_spec_wh: pick the size pair belonging to each peak's class
+        K: number of peaks to keep
+    Returns:
+        b x K x 6 tensor with rows [x1, y1, x2, y2, score, class]
+    """
+    batch, cat, _, _ = heat.size()
+    scores, inds, clses, ys, xs = _topk(_nms(heat), K=K)
+    xs = xs.view(batch, K, 1)
+    ys = ys.view(batch, K, 1)
+    if reg is None:
+        xs, ys = xs + 0.5, ys + 0.5
+    else:
+        offsets = _transpose_and_gather_feat(reg, inds).view(batch, K, 2)
+        xs, ys = xs + offsets[:, :, 0:1], ys + offsets[:, :, 1:2]
+
+    sizes = _transpose_and_gather_feat(wh, inds)
+    if cat_spec_wh:
+        # choose the (w, h) pair of each peak's class
+        picker = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long()
+        sizes = sizes.view(batch, K, cat, 2).gather(2, picker)
+    sizes = sizes.view(batch, K, 2)
+    half_w, half_h = sizes[..., 0:1] / 2, sizes[..., 1:2] / 2
+    boxes = torch.cat([xs - half_w, ys - half_h, xs + half_w, ys + half_h], dim=2)
+    clses = clses.view(batch, K, 1).float()
+    return torch.cat([boxes, scores.view(batch, K, 1), clses], dim=2)
+
+
+def ctseg_decode(heat, wh, seg_feat, conv_weight, reg=None, cat_spec_wh=False, K=100):
+    """Decode K boxes and, for each, a mask computed from seg_feat with predicted conv weights."""
+    batch, cat, height, width = heat.size()
+    # heat is used as given (no sigmoid is applied here)
+    # perform nms on heatmaps
+    heat = _nms(heat)
+
+    scores, inds, clses, ys, xs = _topk(heat, K=K)
+    if reg is not None:
+        reg = _transpose_and_gather_feat(reg, inds)
+        reg = reg.view(batch, K, 2)
+        xs = xs.view(batch, K, 1) + reg[:, :, 0:1]
+        ys = ys.view(batch, K, 1) + reg[:, :, 1:2]
+    else:
+        xs = xs.view(batch, K, 1) + 0.5
+        ys = ys.view(batch, K, 1) + 0.5
+    wh = _transpose_and_gather_feat(wh, inds)
+    if cat_spec_wh:
+        wh = wh.view(batch, K, cat, 2)
+        clses_ind = clses.view(batch, K, 1, 1).expand(batch, K, 1, 2).long()
+        wh = wh.gather(2, clses_ind).view(batch, K, 2)
+    else:
+        wh = wh.view(batch, K, 2)
+    clses = clses.view(batch, K, 1).float()
+    scores = scores.view(batch, K, 1)
+    bboxes = torch.cat([xs - wh[..., 0:1] / 2,
+                        ys - wh[..., 1:2] / 2,
+                        xs + wh[..., 0:1] / 2,
+                        ys + wh[..., 1:2] / 2], dim=2)
+    # masks: a 3-layer 1x1 conv net per detection, its weights read from conv_weight
+    detections = torch.cat([bboxes, scores, clses], dim=2)
+    feat_channel = seg_feat.size(1)
+    h, w = seg_feat.size(-2), seg_feat.size(-1)
+    mask = torch.zeros((batch, K, h, w)).to(device=seg_feat.device)
+    x_range = torch.arange(w).float().to(device=seg_feat.device)
+    y_range = torch.arange(h).float().to(device=seg_feat.device)
+    y_grid, x_grid = torch.meshgrid([y_range, x_range])
+    weight = _transpose_and_gather_feat(conv_weight, inds)
+    for i in range(batch):
+        conv1w, conv1b, conv2w, conv2b, conv3w, conv3b = \
+            torch.split(weight[i], [(feat_channel + 2) * feat_channel, feat_channel,
+                                    feat_channel ** 2, feat_channel,
+                                    feat_channel, 1], dim=-1)
+        y_rel_coord = (y_grid[None, None] -
+                       ys[i].unsqueeze(-1).unsqueeze(-1).float()) / 128.
+        x_rel_coord = (x_grid[None, None] -
+                       xs[i].unsqueeze(-1).unsqueeze(-1).float()) / 128.
+        feat = seg_feat[i][None].repeat([K, 1, 1, 1])
+        feat = torch.cat([feat, x_rel_coord, y_rel_coord],
+                         dim=1).view(1, -1, h, w)
+        # conv2d is reached through nn.functional: this module defines no F alias
+        conv1w = conv1w.contiguous().view(-1, feat_channel + 2, 1, 1)
+        conv1b = conv1b.contiguous().flatten()
+        feat = nn.functional.conv2d(feat, conv1w, conv1b, groups=K).relu()
+        # groups=K: every detection's filters see only that detection's channels
+        conv2w = conv2w.contiguous().view(-1, feat_channel, 1, 1)
+        conv2b = conv2b.contiguous().flatten()
+        feat = nn.functional.conv2d(feat, conv2w, conv2b, groups=K).relu()
+
+        conv3w = conv3w.contiguous().view(-1, feat_channel, 1, 1)
+        conv3b = conv3b.contiguous().flatten()
+        feat = nn.functional.conv2d(feat, conv3w, conv3b, groups=K).sigmoid().squeeze()
+        mask[i] = feat
+
+    return detections, mask
+
+
+def multi_pose_decode(
+        heat, wh, kps, reg=None, hm_hp=None, hp_offset=None, K=100):
+    batch, cat, height, width = heat.size()
+    num_joints = kps.shape[1] // 2
+    # Decode K boxes with num_joints keypoints each; heat is used as given (no sigmoid here)
+    # perform nms on heatmaps
+    heat = _nms(heat)
+    scores, inds, clses, ys, xs = _topk(heat, K=K)
+    # kps holds (x, y) offsets from the peak; adding the peak position makes them absolute
+    kps = _transpose_and_gather_feat(kps, inds)
+    kps = kps.view(batch, K, num_joints * 2)
+    kps[..., ::2] += xs.view(batch, K, 1).expand(batch, K, num_joints)
+    kps[..., 1::2] += ys.view(batch, K, 1).expand(batch, K, num_joints)
+    if reg is not None:
+        reg = _transpose_and_gather_feat(reg, inds)
+        reg = reg.view(batch, K, 2)
+        xs = xs.view(batch, K, 1) + reg[:, :, 0:1]
+        ys = ys.view(batch, K, 1) + reg[:, :, 1:2]
+    else:
+        xs = xs.view(batch, K, 1) + 0.5
+        ys = ys.view(batch, K, 1) + 0.5
+    wh = _transpose_and_gather_feat(wh, inds)
+    wh = wh.view(batch, K, 2)
+    clses = clses.view(batch, K, 1).float()
+    scores = scores.view(batch, K, 1)
+    # boxes as (x1, y1, x2, y2) around the peak position computed above
+    bboxes = torch.cat([xs - wh[..., 0:1] / 2,
+                        ys - wh[..., 1:2] / 2,
+                        xs + wh[..., 0:1] / 2,
+                        ys + wh[..., 1:2] / 2], dim=2)
+    if hm_hp is not None:  # optionally replace regressed joints by nearby heatmap peaks
+        hm_hp = _nms(hm_hp)
+        thresh = 0.1  # minimum joint-heatmap score for a peak to be usable
+        kps = kps.view(batch, K, num_joints, 2).permute(
+            0, 2, 1, 3).contiguous()  # b x J x K x 2
+        reg_kps = kps.unsqueeze(3).expand(batch, num_joints, K, K, 2)
+        hm_score, hm_inds, hm_ys, hm_xs = _topk_channel(
+            hm_hp, K=K)  # b x J x K
+        if hp_offset is not None:
+            hp_offset = _transpose_and_gather_feat(
+                hp_offset, hm_inds.view(batch, -1))
+            hp_offset = hp_offset.view(batch, num_joints, K, 2)
+            hm_xs = hm_xs + hp_offset[:, :, :, 0]
+            hm_ys = hm_ys + hp_offset[:, :, :, 1]
+        else:
+            hm_xs = hm_xs + 0.5
+            hm_ys = hm_ys + 0.5
+        # peaks at or below thresh get score -1 and coordinates -10000 (far from any joint)
+        mask = (hm_score > thresh).float()
+        hm_score = (1 - mask) * -1 + mask * hm_score
+        hm_ys = (1 - mask) * (-10000) + mask * hm_ys
+        hm_xs = (1 - mask) * (-10000) + mask * hm_xs
+        hm_kps = torch.stack([hm_xs, hm_ys], dim=-1).unsqueeze(
+            2).expand(batch, num_joints, K, K, 2)
+        dist = (((reg_kps - hm_kps) ** 2).sum(dim=4) ** 0.5)  # b x J x K x K distances
+        min_dist, min_ind = dist.min(dim=3)  # b x J x K
+        hm_score = hm_score.gather(2, min_ind).unsqueeze(-1)  # b x J x K x 1
+        min_dist = min_dist.unsqueeze(-1)
+        min_ind = min_ind.view(batch, num_joints, K, 1, 1).expand(
+            batch, num_joints, K, 1, 2)
+        hm_kps = hm_kps.gather(3, min_ind)  # nearest heatmap peak for every regressed joint
+        hm_kps = hm_kps.view(batch, num_joints, K, 2)
+        l = bboxes[:, :, 0].view(batch, 1, K, 1).expand(
+            batch, num_joints, K, 1)
+        t = bboxes[:, :, 1].view(batch, 1, K, 1).expand(
+            batch, num_joints, K, 1)
+        r = bboxes[:, :, 2].view(batch, 1, K, 1).expand(
+            batch, num_joints, K, 1)
+        b = bboxes[:, :, 3].view(batch, 1, K, 1).expand(
+            batch, num_joints, K, 1)
+        mask = (hm_kps[..., 0:1] < l) + (hm_kps[..., 0:1] > r) + \
+               (hm_kps[..., 1:2] < t) + (hm_kps[..., 1:2] > b) + \
+               (hm_score < thresh) + (min_dist > (torch.max(b - t, r - l) * 0.3))
+        mask = (mask > 0).float().expand(batch, num_joints, K, 2)  # 1 = keep regressed joint
+        kps = (1 - mask) * hm_kps + mask * kps
+        kps = kps.permute(0, 2, 1, 3).contiguous().view(
+            batch, K, num_joints * 2)
+    detections = torch.cat([bboxes, scores, kps, clses], dim=2)
+    # row layout: 4 box coords, score, 2 * num_joints keypoint coords, class
+    return detections
diff --git a/src/lib/models/losses.py b/src/lib/models/losses.py
new file mode 100644
index 0000000..95cdbcc
--- /dev/null
+++ b/src/lib/models/losses.py
@@ -0,0 +1,310 @@
+# ------------------------------------------------------------------------------
+# Portions of this code are from
+# CornerNet (https://github.com/princeton-vl/CornerNet)
+# Copyright (c) 2018, University of Michigan
+# Licensed under the BSD 3-Clause License
+# ------------------------------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import torch.nn as nn
+from .utils import _transpose_and_gather_feat
+import torch.nn.functional as F
+
+
+def _slow_neg_loss(pred, gt):
+    '''Focal loss from CornerNet, computed on indexed subsets of pred.
+
+      Arguments:
+        pred: predicted heatmap
+        gt: target heatmap; entries equal to 1 are positives, entries below 1
+            are negatives weighted by (1 - gt) ** 4
+      Returns the summed loss divided by the number of positives, or the
+      negative part alone when there are no positives.
+    '''
+    is_pos = gt.eq(1)
+    is_neg = gt.lt(1)
+    p_pos = pred[is_pos]
+    p_neg = pred[is_neg]
+    down_weight = torch.pow(1 - gt[is_neg], 4)
+
+    pos_term = (torch.log(p_pos) * torch.pow(1 - p_pos, 2)).sum()
+    neg_term = (torch.log(1 - p_neg) * torch.pow(p_neg, 2) * down_weight).sum()
+
+    if p_pos.nelement() == 0:
+        return 0 - neg_term
+    count = is_pos.float().sum()
+    return 0 - (pos_term + neg_term) / count
+
+
+def _neg_loss(pred, gt):
+    ''' Modified focal loss, the same formula as CornerNet, evaluated with
+        dense float masks instead of boolean indexing.
+      Arguments:
+        pred (batch x c x h x w)
+        gt (batch x c x h x w)
+      Returns:
+        the summed loss divided by the number of positives (gt == 1), or the
+        summed negative term alone when there are no positives
+    '''
+    pos_mask = gt.eq(1).float()
+    neg_mask = gt.lt(1).float()
+
+    # negatives are down-weighted by (1 - gt) ** 4
+    down_weight = torch.pow(1 - gt, 4)
+
+    pos_term = torch.log(pred) * torch.pow(1 - pred, 2) * pos_mask
+    neg_term = torch.log(1 - pred) * torch.pow(pred, 2) * \
+        down_weight * neg_mask
+
+    count = pos_mask.float().sum()
+    pos_total = pos_term.sum()
+    neg_total = neg_term.sum()
+
+    if count == 0:
+        return 0 - neg_total
+    return 0 - (pos_total + neg_total) / count
+
+
+def _not_faster_neg_loss(pred, gt):
+    """Focal loss written as a single log/pow expression over all pixels.
+
+    Positives (gt == 1) are evaluated on 1 - pred with weight 1, negatives on pred
+    with weight (1 - gt) ** 4; a non-zero positive count divides the sum."""
+    pos_mask = gt.eq(1).float()
+    neg_mask = gt.lt(1).float()
+    count = pos_mask.float().sum()
+
+    flipped = pred * neg_mask + (1 - pred) * pos_mask
+    scale = torch.pow(1 - gt, 4) * neg_mask + pos_mask
+    total = (torch.log(1 - flipped) * torch.pow(flipped, 2) * scale).sum()
+
+    if count > 0:
+        total /= count
+    return 0 - total
+
+
+def _slow_reg_loss(regr, gt_regr, mask):
+    """Smooth L1 loss over the entries indexed by mask (batch x max_objects),
+    summed and divided by the mask total plus 1e-4."""
+    num = mask.float().sum()
+    mask = mask.unsqueeze(2).expand_as(gt_regr)
+    regr = regr[mask]
+    gt_regr = gt_regr[mask]
+    # reduction='sum' replaces the deprecated size_average=False
+    regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, reduction='sum')
+    return regr_loss / (num + 1e-4)
+
+
+def _reg_loss(regr, gt_regr, mask):
+    ''' Smooth L1 regression loss, normalised by the number of masked objects
+      Arguments:
+        regr (batch x max_objects x dim)
+        gt_regr (batch x max_objects x dim)
+        mask (batch x max_objects)
+    '''
+    num = mask.float().sum()
+    mask = mask.unsqueeze(2).expand_as(gt_regr).float()
+
+    # zero both sides outside the mask so those entries add nothing to the sum
+    regr = regr * mask
+    gt_regr = gt_regr * mask
+    # reduction='sum' is the supported spelling of the deprecated size_average=False
+    regr_loss = nn.functional.smooth_l1_loss(regr, gt_regr, reduction='sum')
+    return regr_loss / (num + 1e-4)
+
+
+class FocalLoss(nn.Module):
+    '''nn.Module wrapper for the focal loss implemented by _neg_loss'''
+
+    def __init__(self):
+        super(FocalLoss, self).__init__()
+        self.neg_loss = _neg_loss
+
+    def forward(self, out, target):
+        return self.neg_loss(out, target)
+
+
+class RegLoss(nn.Module):
+    '''Smooth L1 regression loss on a dense output sampled at given indices
+      Arguments:
+        output (batch x dim x h x w)
+        mask (batch x max_objects)
+        ind (batch x max_objects)
+        target (batch x max_objects x dim)
+    '''
+
+    def __init__(self):
+        super(RegLoss, self).__init__()
+
+    def forward(self, output, mask, ind, target):
+        # gather the predictions at ind, then defer to _reg_loss
+        sampled = _transpose_and_gather_feat(output, ind)
+        return _reg_loss(sampled, target, mask)
+
+
+class RegL1Loss(nn.Module):
+    """L1 loss at the gathered locations, summed and divided by the mask total."""
+    def __init__(self):
+        super(RegL1Loss, self).__init__()
+
+    def forward(self, output, mask, ind, target):
+        pred = _transpose_and_gather_feat(output, ind)
+        mask = mask.unsqueeze(2).expand_as(pred).float()
+        # reduction='sum' replaces the deprecated size_average=False
+        loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
+        return loss / (mask.sum() + 1e-4)
+
+
+def dice_loss(input, target):
+    """Dice loss with smoothing 1: 1 - (2*sum(a*b) + 1) / (sum(a*a) + sum(b*b) + 1)."""
+    a = input.contiguous().view(-1)
+    b = target.contiguous().view(-1)
+    denom = (a * a).sum() + (b * b).sum() + 1.
+    return 1 - ((2. * (a * b).sum() + 1.) / denom)
+
+
+class DiceLoss(nn.Module):
+    """Batch-mean dice loss on masks produced by per-object 1x1 convs with predicted weights."""
+    def __init__(self, feat_channel):
+        super(DiceLoss, self).__init__()
+        self.feat_channel = feat_channel
+
+    def forward(self, seg_feat, conv_weight, mask, ind, target):
+        mask_loss = 0.
+        batch_size = seg_feat.size(0)
+        weight = _transpose_and_gather_feat(conv_weight, ind)
+        h, w = seg_feat.size(-2), seg_feat.size(-1)
+        x, y = ind % w, ind // w  # floor division: a true-division row would be fractional
+        x_range = torch.arange(w).float().to(device=seg_feat.device)
+        y_range = torch.arange(h).float().to(device=seg_feat.device)
+        y_grid, x_grid = torch.meshgrid([y_range, x_range])
+        for i in range(batch_size):
+            num_obj = target[i].size(0)
+            conv1w, conv1b, conv2w, conv2b, conv3w, conv3b = \
+                torch.split(weight[i, :num_obj], [(self.feat_channel+2)*self.feat_channel, self.feat_channel,
+                                                  self.feat_channel**2, self.feat_channel,
+                                                  self.feat_channel, 1], dim=-1)
+            y_rel_coord = (
+                y_grid[None, None] - y[i, :num_obj].unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).float())/128.
+            x_rel_coord = (
+                x_grid[None, None] - x[i, :num_obj].unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).float())/128.
+            feat = seg_feat[i][None].repeat([num_obj, 1, 1, 1])
+            feat = torch.cat([feat, x_rel_coord, y_rel_coord],
+                             dim=1).view(1, -1, h, w)
+
+            conv1w = conv1w.contiguous().view(-1, self.feat_channel+2, 1, 1)
+            conv1b = conv1b.contiguous().flatten()
+            feat = F.conv2d(feat, conv1w, conv1b, groups=num_obj).relu()
+
+            conv2w = conv2w.contiguous().view(-1, self.feat_channel, 1, 1)
+            conv2b = conv2b.contiguous().flatten()
+            feat = F.conv2d(feat, conv2w, conv2b, groups=num_obj).relu()
+
+            conv3w = conv3w.contiguous().view(-1, self.feat_channel, 1, 1)
+            conv3b = conv3b.contiguous().flatten()
+            feat = F.conv2d(feat, conv3w, conv3b,
+                            groups=num_obj).sigmoid().squeeze()
+
+            true_mask = mask[i, :num_obj, None, None].float()
+            mask_loss += dice_loss(feat*true_mask, target[i]*true_mask)
+        return mask_loss/batch_size
+
+
+class NormRegL1Loss(nn.Module):
+    def __init__(self):
+        super(NormRegL1Loss, self).__init__()
+
+    def forward(self, output, mask, ind, target):
+        pred = _transpose_and_gather_feat(output, ind)
+        mask = mask.unsqueeze(2).expand_as(pred).float()
+        # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
+        pred = pred / (target + 1e-4)
+        target = target * 0 + 1
+        loss = F.l1_loss(pred * mask, target * mask, size_average=False)
+        loss = loss / (mask.sum() + 1e-4)
+        return loss
+
+
+class RegWeightedL1Loss(nn.Module):
+    def __init__(self):
+        super(RegWeightedL1Loss, self).__init__()
+
+    def forward(self, output, mask, ind, target):
+        pred = _transpose_and_gather_feat(output, ind)
+        mask = mask.float()
+        # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
+        loss = F.l1_loss(pred * mask, target * mask, size_average=False)
+        loss = loss / (mask.sum() + 1e-4)
+        return loss
+
+
+class L1Loss(nn.Module):
+    def __init__(self):
+        super(L1Loss, self).__init__()
+
+    def forward(self, output, mask, ind, target):
+        pred = _transpose_and_gather_feat(output, ind)
+        mask = mask.unsqueeze(2).expand_as(pred).float()
+        loss = F.l1_loss(pred * mask, target * mask,
+                         reduction='elementwise_mean')
+        return loss
+
+
+class BinRotLoss(nn.Module):
+    def __init__(self):
+        super(BinRotLoss, self).__init__()
+
+    def forward(self, output, mask, ind, rotbin, rotres):
+        pred = _transpose_and_gather_feat(output, ind)
+        loss = compute_rot_loss(pred, rotbin, rotres, mask)
+        return loss
+
+
+def compute_res_loss(output, target):
+    return F.smooth_l1_loss(output, target, reduction='elementwise_mean')
+
+# TODO: weight
+
+
+def compute_bin_loss(output, target, mask):
+    mask = mask.expand_as(output)
+    output = output * mask.float()
+    return F.cross_entropy(output, target, reduction='elementwise_mean')
+
+
+def compute_rot_loss(output, target_bin, target_res, mask):
+    # output: (B, 128, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos,
+    #                 bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos]
+    # target_bin: (B, 128, 2) [bin1_cls, bin2_cls]
+    # target_res: (B, 128, 2) [bin1_res, bin2_res]
+    # mask: (B, 128, 1)
+    # import pdb; pdb.set_trace()
+    output = output.view(-1, 8)
+    target_bin = target_bin.view(-1, 2)
+    target_res = target_res.view(-1, 2)
+    mask = mask.view(-1, 1)
+    loss_bin1 = compute_bin_loss(output[:, 0:2], target_bin[:, 0], mask)
+    loss_bin2 = compute_bin_loss(output[:, 4:6], target_bin[:, 1], mask)
+    loss_res = torch.zeros_like(loss_bin1)
+    if target_bin[:, 0].nonzero().shape[0] > 0:
+        idx1 = target_bin[:, 0].nonzero()[:, 0]
+        valid_output1 = torch.index_select(output, 0, idx1.long())
+        valid_target_res1 = torch.index_select(target_res, 0, idx1.long())
+        loss_sin1 = compute_res_loss(
+            valid_output1[:, 2], torch.sin(valid_target_res1[:, 0]))
+        loss_cos1 = compute_res_loss(
+            valid_output1[:, 3], torch.cos(valid_target_res1[:, 0]))
+        loss_res += loss_sin1 + loss_cos1
+    if target_bin[:, 1].nonzero().shape[0] > 0:
+        idx2 = target_bin[:, 1].nonzero()[:, 0]
+        valid_output2 = torch.index_select(output, 0, idx2.long())
+        valid_target_res2 = torch.index_select(target_res, 0, idx2.long())
+        loss_sin2 = compute_res_loss(
+            valid_output2[:, 6], torch.sin(valid_target_res2[:, 1]))
+        loss_cos2 = compute_res_loss(
+            valid_output2[:, 7], torch.cos(valid_target_res2[:, 1]))
+        loss_res += loss_sin2 + loss_cos2
+    return loss_bin1 + loss_bin2 + loss_res
diff --git a/src/lib/models/model.py b/src/lib/models/model.py
new file mode 100644
index 0000000..643660e
--- /dev/null
+++ b/src/lib/models/model.py
@@ -0,0 +1,96 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torchvision.models as models
+import torch
+import torch.nn as nn
+import os
+
+from .networks.msra_resnet import get_pose_net
+from .networks.dlav0 import get_pose_net as get_dlav0
+from .networks.pose_dla_dcn import get_pose_net as get_dla_dcn
+from .networks.resnet_dcn import get_pose_net as get_pose_net_dcn
+from .networks.large_hourglass import get_large_hourglass_net
+
+# Maps the architecture key (the part of ``arch`` before any '_', see
+# create_model) to the function that builds the corresponding network.
+_model_factory = {
+  'res': get_pose_net, # default Resnet with deconv
+  'dlav0': get_dlav0, # default DLAup
+  'dla': get_dla_dcn,
+  'resdcn': get_pose_net_dcn,
+  'hourglass': get_large_hourglass_net,
+}
+
+def create_model(arch, heads, head_conv):
+  num_layers = int(arch[arch.find('_') + 1:]) if '_' in arch else 0
+  arch = arch[:arch.find('_')] if '_' in arch else arch
+  get_model = _model_factory[arch]
+  model = get_model(num_layers=num_layers, heads=heads, head_conv=head_conv)
+  return model
+
+def load_model(model, model_path, optimizer=None, resume=False, 
+               lr=None, lr_step=None):
+  start_epoch = 0
+  checkpoint = torch.load(model_path, map_location=lambda storage, loc: storage)
+  print('loaded {}, epoch {}'.format(model_path, checkpoint['epoch']))
+  state_dict_ = checkpoint['state_dict']
+  state_dict = {}
+  
+  # convert data_parallal to model
+  for k in state_dict_:
+    if k.startswith('module') and not k.startswith('module_list'):
+      state_dict[k[7:]] = state_dict_[k]
+    else:
+      state_dict[k] = state_dict_[k]
+  model_state_dict = model.state_dict()
+
+  # check loaded parameters and created model parameters
+  msg = 'If you see this, your model does not fully load the ' + \
+        'pre-trained weight. Please make sure ' + \
+        'you have correctly specified --arch xxx ' + \
+        'or set the correct --num_classes for your own dataset.'
+  for k in state_dict:
+    if k in model_state_dict:
+      if state_dict[k].shape != model_state_dict[k].shape:
+        print('Skip loading parameter {}, required shape{}, '\
+              'loaded shape{}. {}'.format(
+          k, model_state_dict[k].shape, state_dict[k].shape, msg))
+        state_dict[k] = model_state_dict[k]
+    else:
+      print('Drop parameter {}.'.format(k) + msg)
+  for k in model_state_dict:
+    if not (k in state_dict):
+      print('No param {}.'.format(k) + msg)
+      state_dict[k] = model_state_dict[k]
+  model.load_state_dict(state_dict, strict=False)
+
+  # resume optimizer parameters
+  if optimizer is not None and resume:
+    if 'optimizer' in checkpoint:
+      optimizer.load_state_dict(checkpoint['optimizer'])
+      start_epoch = checkpoint['epoch']
+      start_lr = lr
+      for step in lr_step:
+        if start_epoch >= step:
+          start_lr *= 0.1
+      for param_group in optimizer.param_groups:
+        param_group['lr'] = start_lr
+      print('Resumed optimizer with start lr', start_lr)
+    else:
+      print('No optimizer parameters in checkpoint.')
+  if optimizer is not None:
+    return model, optimizer, start_epoch
+  else:
+    return model
+
+def save_model(path, epoch, model, optimizer=None):
+  if isinstance(model, torch.nn.DataParallel):
+    state_dict = model.module.state_dict()
+  else:
+    state_dict = model.state_dict()
+  data = {'epoch': epoch,
+          'state_dict': state_dict}
+  if not (optimizer is None):
+    data['optimizer'] = optimizer.state_dict()
+  torch.save(data, path)
+
diff --git a/src/lib/models/networks/DCNv2 b/src/lib/models/networks/DCNv2
new file mode 160000
index 0000000..c7f778f
--- /dev/null
+++ b/src/lib/models/networks/DCNv2
@@ -0,0 +1 @@
+Subproject commit c7f778f28b84c66d3af2bf16f19148a07051dac1
diff --git a/src/lib/models/networks/dlav0.py b/src/lib/models/networks/dlav0.py
new file mode 100644
index 0000000..3ff343c
--- /dev/null
+++ b/src/lib/models/networks/dlav0.py
@@ -0,0 +1,647 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+from os.path import join
+
+import torch
+from torch import nn
+import torch.utils.model_zoo as model_zoo
+
+import numpy as np
+
+# Normalization layer class used by the modules in this file. set_bn() rebinds
+# it, which only affects modules constructed afterwards.
+BatchNorm = nn.BatchNorm2d
+
+def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
+    return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    "3x3 convolution with padding"
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BasicBlock, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
+                               stride=stride, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn1 = BatchNorm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+                               stride=1, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn2 = BatchNorm(planes)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        if residual is None:
+            residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 2
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(Bottleneck, self).__init__()
+        expansion = Bottleneck.expansion
+        bottle_planes = planes // expansion
+        self.conv1 = nn.Conv2d(inplanes, bottle_planes,
+                               kernel_size=1, bias=False)
+        self.bn1 = BatchNorm(bottle_planes)
+        self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
+                               stride=stride, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn2 = BatchNorm(bottle_planes)
+        self.conv3 = nn.Conv2d(bottle_planes, planes,
+                               kernel_size=1, bias=False)
+        self.bn3 = BatchNorm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        if residual is None:
+            residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class BottleneckX(nn.Module):
+    expansion = 2
+    cardinality = 32
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BottleneckX, self).__init__()
+        cardinality = BottleneckX.cardinality
+        # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
+        # bottle_planes = dim * cardinality
+        bottle_planes = planes * cardinality // 32
+        self.conv1 = nn.Conv2d(inplanes, bottle_planes,
+                               kernel_size=1, bias=False)
+        self.bn1 = BatchNorm(bottle_planes)
+        self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
+                               stride=stride, padding=dilation, bias=False,
+                               dilation=dilation, groups=cardinality)
+        self.bn2 = BatchNorm(bottle_planes)
+        self.conv3 = nn.Conv2d(bottle_planes, planes,
+                               kernel_size=1, bias=False)
+        self.bn3 = BatchNorm(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        if residual is None:
+            residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Root(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, residual):
+        super(Root, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, 1,
+            stride=1, bias=False, padding=(kernel_size - 1) // 2)
+        self.bn = BatchNorm(out_channels)
+        self.relu = nn.ReLU(inplace=True)
+        self.residual = residual
+
+    def forward(self, *x):
+        children = x
+        x = self.conv(torch.cat(x, 1))
+        x = self.bn(x)
+        if self.residual:
+            x += children[0]
+        x = self.relu(x)
+
+        return x
+
+
+class Tree(nn.Module):
+    """Recursive aggregation node built from ``block`` modules.
+
+    With ``levels == 1`` the node holds two blocks (``tree1``, ``tree2``) and
+    a ``Root`` that fuses their outputs together with any ``children``
+    handed down by the caller. With ``levels > 1`` both ``tree1`` and
+    ``tree2`` are Trees of depth ``levels - 1``; ``tree1``'s output is
+    appended to ``children`` and passed on to ``tree2``, whose innermost
+    Root performs the fusion.
+    """
+    def __init__(self, levels, block, in_channels, out_channels, stride=1,
+                 level_root=False, root_dim=0, root_kernel_size=1,
+                 dilation=1, root_residual=False):
+        super(Tree, self).__init__()
+        # root_dim is the input channel count of the Root conv: by default
+        # the two block outputs, plus the (downsampled) input when this node
+        # is a level root.
+        if root_dim == 0:
+            root_dim = 2 * out_channels
+        if level_root:
+            root_dim += in_channels
+        if levels == 1:
+            self.tree1 = block(in_channels, out_channels, stride,
+                               dilation=dilation)
+            self.tree2 = block(out_channels, out_channels, 1,
+                               dilation=dilation)
+        else:
+            self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
+                              stride, root_dim=0,
+                              root_kernel_size=root_kernel_size,
+                              dilation=dilation, root_residual=root_residual)
+            # tree2's root additionally receives tree1's output, hence the
+            # extra out_channels.
+            self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
+                              root_dim=root_dim + out_channels,
+                              root_kernel_size=root_kernel_size,
+                              dilation=dilation, root_residual=root_residual)
+        # Only leaf-level nodes own a Root; deeper nodes rely on the Root of
+        # their innermost tree2.
+        if levels == 1:
+            self.root = Root(root_dim, out_channels, root_kernel_size,
+                             root_residual)
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.downsample = None
+        self.project = None
+        self.levels = levels
+        # downsample/project adapt the input so it can serve as the residual
+        # of tree1 (spatial size and channel count respectively).
+        if stride > 1:
+            self.downsample = nn.MaxPool2d(stride, stride=stride)
+        if in_channels != out_channels:
+            self.project = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels,
+                          kernel_size=1, stride=1, bias=False),
+                BatchNorm(out_channels)
+            )
+
+    def forward(self, x, residual=None, children=None):
+        """Run the node; ``children`` are extra maps forwarded to the Root.
+
+        The incoming ``residual`` argument is not used: it is recomputed from
+        ``x`` below.
+        """
+        children = [] if children is None else children
+        bottom = self.downsample(x) if self.downsample else x
+        residual = self.project(bottom) if self.project else bottom
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, residual)
+        if self.levels == 1:
+            x2 = self.tree2(x1)
+            x = self.root(x2, x1, *children)
+        else:
+            children.append(x1)
+            x = self.tree2(x1, children=children)
+        return x
+
+
+class DLA(nn.Module):
+    """DLA backbone: a 7x7 stem, two plain conv levels and four Tree levels.
+
+    ``levels`` and ``channels`` are indexed 0..5, one entry per level. With
+    ``return_levels`` set, ``forward`` returns the list of all six level
+    outputs; otherwise it returns the pooled, classified and flattened
+    output of the last level.
+    """
+    def __init__(self, levels, channels, num_classes=1000,
+                 block=BasicBlock, residual_root=False, return_levels=False,
+                 pool_size=7, linear_root=False):
+        # NOTE(review): ``linear_root`` is accepted but not used in this class.
+        super(DLA, self).__init__()
+        self.channels = channels
+        self.return_levels = return_levels
+        self.num_classes = num_classes
+        self.base_layer = nn.Sequential(
+            nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
+                      padding=3, bias=False),
+            BatchNorm(channels[0]),
+            nn.ReLU(inplace=True))
+        self.level0 = self._make_conv_level(
+            channels[0], channels[0], levels[0])
+        self.level1 = self._make_conv_level(
+            channels[0], channels[1], levels[1], stride=2)
+        # Levels 1-5 each use stride 2.
+        self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
+                           level_root=False,
+                           root_residual=residual_root)
+        self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
+                           level_root=True, root_residual=residual_root)
+
+        self.avgpool = nn.AvgPool2d(pool_size)
+        # Classifier implemented as a 1x1 convolution.
+        self.fc = nn.Conv2d(channels[-1], num_classes, kernel_size=1,
+                            stride=1, padding=0, bias=True)
+
+        # Conv weights ~ N(0, sqrt(2 / (k_h * k_w * out_channels)));
+        # batch norm starts as identity (weight 1, bias 0).
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    # NOTE(review): not called anywhere in this class, and it passes a
+    # ``downsample=`` keyword that none of the block classes defined in this
+    # file accept -- calling it with those blocks would raise TypeError.
+    def _make_level(self, block, inplanes, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or inplanes != planes:
+            downsample = nn.Sequential(
+                nn.MaxPool2d(stride, stride=stride),
+                nn.Conv2d(inplanes, planes,
+                          kernel_size=1, stride=1, bias=False),
+                BatchNorm(planes),
+            )
+
+        layers = []
+        layers.append(block(inplanes, planes, stride, downsample=downsample))
+        for i in range(1, blocks):
+            layers.append(block(inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
+        """Stack ``convs`` 3x3 conv + BN + ReLU triples; only the first conv is strided."""
+        modules = []
+        for i in range(convs):
+            modules.extend([
+                nn.Conv2d(inplanes, planes, kernel_size=3,
+                          stride=stride if i == 0 else 1,
+                          padding=dilation, bias=False, dilation=dilation),
+                BatchNorm(planes),
+                nn.ReLU(inplace=True)])
+            inplanes = planes
+        return nn.Sequential(*modules)
+
+    def forward(self, x):
+        """Run the stem and level0..level5, collecting every level's output."""
+        y = []
+        x = self.base_layer(x)
+        for i in range(6):
+            x = getattr(self, 'level{}'.format(i))(x)
+            y.append(x)
+        if self.return_levels:
+            return y
+        else:
+            x = self.avgpool(x)
+            x = self.fc(x)
+            x = x.view(x.size(0), -1)
+
+            return x
+
+    def load_pretrained_model(self,  data='imagenet', name='dla34', hash='ba72cf86'):
+        """Load pretrained weights while keeping this model's own ``fc`` layer.
+
+        If ``name`` ends with '.pth' the weights are read from the local path
+        ``data + name`` (plain string concatenation); otherwise they are
+        downloaded from ``get_model_url(data, name, hash)``.
+        """
+        fc = self.fc
+        if name.endswith('.pth'):
+            model_weights = torch.load(data + name)
+        else:
+            model_url = get_model_url(data, name, hash)
+            model_weights = model_zoo.load_url(model_url)
+        # Length of the last checkpoint entry (presumably the classifier
+        # bias, i.e. the checkpoint's class count -- TODO confirm).
+        num_classes = len(model_weights[list(model_weights.keys())[-1]])
+        # Temporarily install a classifier of matching size so that the
+        # strict load_state_dict succeeds, then restore the original one.
+        self.fc = nn.Conv2d(
+            self.channels[-1], num_classes,
+            kernel_size=1, stride=1, padding=0, bias=True)
+        self.load_state_dict(model_weights)
+        self.fc = fc
+
+
+def dla34(pretrained, **kwargs):  # DLA-34
+    model = DLA([1, 1, 1, 2, 2, 1],
+                [16, 32, 64, 128, 256, 512],
+                block=BasicBlock, **kwargs)
+    if pretrained:
+        model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
+    return model
+
+
+def dla46_c(pretrained=None, **kwargs):  # DLA-46-C
+    Bottleneck.expansion = 2
+    model = DLA([1, 1, 1, 2, 2, 1],
+                [16, 32, 64, 64, 128, 256],
+                block=Bottleneck, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla46_c')
+    return model
+
+
+def dla46x_c(pretrained=None, **kwargs):  # DLA-X-46-C
+    BottleneckX.expansion = 2
+    model = DLA([1, 1, 1, 2, 2, 1],
+                [16, 32, 64, 64, 128, 256],
+                block=BottleneckX, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla46x_c')
+    return model
+
+
+def dla60x_c(pretrained, **kwargs):  # DLA-X-60-C
+    BottleneckX.expansion = 2
+    model = DLA([1, 1, 1, 2, 3, 1],
+                [16, 32, 64, 64, 128, 256],
+                block=BottleneckX, **kwargs)
+    if pretrained:
+        model.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c')
+    return model
+
+
+def dla60(pretrained=None, **kwargs):  # DLA-60
+    Bottleneck.expansion = 2
+    model = DLA([1, 1, 1, 2, 3, 1],
+                [16, 32, 128, 256, 512, 1024],
+                block=Bottleneck, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla60')
+    return model
+
+
+def dla60x(pretrained=None, **kwargs):  # DLA-X-60
+    BottleneckX.expansion = 2
+    model = DLA([1, 1, 1, 2, 3, 1],
+                [16, 32, 128, 256, 512, 1024],
+                block=BottleneckX, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla60x')
+    return model
+
+
+def dla102(pretrained=None, **kwargs):  # DLA-102
+    Bottleneck.expansion = 2
+    model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
+                block=Bottleneck, residual_root=True, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla102')
+    return model
+
+
+def dla102x(pretrained=None, **kwargs):  # DLA-X-102
+    BottleneckX.expansion = 2
+    model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
+                block=BottleneckX, residual_root=True, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla102x')
+    return model
+
+
+def dla102x2(pretrained=None, **kwargs):  # DLA-X-102 64
+    BottleneckX.cardinality = 64
+    model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
+                block=BottleneckX, residual_root=True, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla102x2')
+    return model
+
+
+def dla169(pretrained=None, **kwargs):  # DLA-169
+    Bottleneck.expansion = 2
+    model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024],
+                block=Bottleneck, residual_root=True, **kwargs)
+    if pretrained is not None:
+        model.load_pretrained_model(pretrained, 'dla169')
+    return model
+
+
+def set_bn(bn):
+    global BatchNorm
+    BatchNorm = bn
+    dla.BatchNorm = bn
+
+
+class Identity(nn.Module):
+    def __init__(self):
+        super(Identity, self).__init__()
+
+    def forward(self, x):
+        return x
+
+
+def fill_up_weights(up):
+    w = up.weight.data
+    f = math.ceil(w.size(2) / 2)
+    c = (2 * f - 1 - f % 2) / (2. * f)
+    for i in range(w.size(2)):
+        for j in range(w.size(3)):
+            w[0, 0, i, j] = \
+                (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
+    for c in range(1, w.size(0)):
+        w[c, 0, :, :] = w[0, 0, :, :]
+
+
+class IDAUp(nn.Module):
+    """Project, upsample and progressively fuse a list of feature maps.
+
+    For input ``i`` the module registers ``proj_i`` (channel projection to
+    ``out_dim``) and ``up_i`` (upsampling by ``up_factors[i]``); for every
+    ``i >= 1`` it also registers ``node_i``, which fuses the running result
+    with input ``i``.
+    """
+    def __init__(self, node_kernel, out_dim, channels, up_factors):
+        super(IDAUp, self).__init__()
+        self.channels = channels
+        self.out_dim = out_dim
+        for i, c in enumerate(channels):
+            # No projection needed when the channel count already matches.
+            if c == out_dim:
+                proj = Identity()
+            else:
+                proj = nn.Sequential(
+                    nn.Conv2d(c, out_dim,
+                              kernel_size=1, stride=1, bias=False),
+                    BatchNorm(out_dim),
+                    nn.ReLU(inplace=True))
+            f = int(up_factors[i])
+            if f == 1:
+                up = Identity()
+            else:
+                # Per-channel (groups=out_dim) transposed conv with stride f,
+                # initialised by fill_up_weights.
+                up = nn.ConvTranspose2d(
+                    out_dim, out_dim, f * 2, stride=f, padding=f // 2,
+                    output_padding=0, groups=out_dim, bias=False)
+                fill_up_weights(up)
+            setattr(self, 'proj_' + str(i), proj)
+            setattr(self, 'up_' + str(i), up)
+
+        # node_i takes the concatenation of two out_dim-channel maps.
+        for i in range(1, len(channels)):
+            node = nn.Sequential(
+                nn.Conv2d(out_dim * 2, out_dim,
+                          kernel_size=node_kernel, stride=1,
+                          padding=node_kernel // 2, bias=False),
+                BatchNorm(out_dim),
+                nn.ReLU(inplace=True))
+            setattr(self, 'node_' + str(i), node)
+
+        # Conv weights ~ N(0, sqrt(2 / (k_h * k_w * out_channels)));
+        # batch norm starts as identity (weight 1, bias 0).
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def forward(self, layers):
+        """Return ``(x, y)``: the final fused map and the list of every node output.
+
+        ``layers`` must contain exactly one map per entry of ``channels``.
+        """
+        assert len(self.channels) == len(layers), \
+            '{} vs {} layers'.format(len(self.channels), len(layers))
+        layers = list(layers)
+        for i, l in enumerate(layers):
+            upsample = getattr(self, 'up_' + str(i))
+            project = getattr(self, 'proj_' + str(i))
+            layers[i] = upsample(project(l))
+        x = layers[0]
+        y = []
+        # Fold the maps left to right, keeping every intermediate result.
+        for i in range(1, len(layers)):
+            node = getattr(self, 'node_' + str(i))
+            x = node(torch.cat([x, layers[i]], 1))
+            y.append(x)
+        return x, y
+
+
+class DLAUp(nn.Module):
+    def __init__(self, channels, scales=(1, 2, 4, 8, 16), in_channels=None):
+        super(DLAUp, self).__init__()
+        if in_channels is None:
+            in_channels = channels
+        self.channels = channels
+        channels = list(channels)
+        scales = np.array(scales, dtype=int)
+        for i in range(len(channels) - 1):
+            j = -i - 2
+            setattr(self, 'ida_{}'.format(i),
+                    IDAUp(3, channels[j], in_channels[j:],
+                          scales[j:] // scales[j]))
+            scales[j + 1:] = scales[j]
+            in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
+
+    def forward(self, layers):
+        layers = list(layers)
+        assert len(layers) > 1
+        for i in range(len(layers) - 1):
+            ida = getattr(self, 'ida_{}'.format(i))
+            x, y = ida(layers[-i - 2:])
+            layers[-i - 1:] = y
+        return x
+
+def fill_fc_weights(layers):
+    for m in layers.modules():
+        if isinstance(m, nn.Conv2d):
+            nn.init.normal_(m.weight, std=0.001)
+            # torch.nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
+            # torch.nn.init.xavier_normal_(m.weight.data)
+            if m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+
+class DLASeg(nn.Module):
+    """DLA backbone + DLAUp decoder with one convolutional head per output.
+
+    ``heads`` maps a head name to its number of output channels; each head is
+    registered as an attribute under its name. ``forward`` returns a
+    one-element list holding a dict of head outputs.
+    """
+    def __init__(self, base_name, heads,
+                 pretrained=True, down_ratio=4, head_conv=256):
+        super(DLASeg, self).__init__()
+        assert down_ratio in [2, 4, 8, 16]
+        self.heads = heads
+        # Index of the first backbone level fed to the decoder.
+        self.first_level = int(np.log2(down_ratio))
+        # base_name is looked up among this module's globals (e.g. 'dla34').
+        self.base = globals()[base_name](
+          pretrained=pretrained, return_levels=True)
+        channels = self.base.channels
+        scales = [2 ** i for i in range(len(channels[self.first_level:]))]
+        self.dla_up = DLAUp(channels[self.first_level:], scales=scales)
+        '''
+        self.fc = nn.Sequential(
+            nn.Conv2d(channels[self.first_level], classes, kernel_size=1,
+                      stride=1, padding=0, bias=True)
+        )
+        '''
+
+        for head in self.heads:
+            classes = self.heads[head]
+            if head_conv > 0:
+                # 3x3 conv -> ReLU -> 1x1 conv
+                fc = nn.Sequential(
+                  nn.Conv2d(channels[self.first_level], head_conv,
+                    kernel_size=3, padding=1, bias=True),
+                  nn.ReLU(inplace=True),
+                  nn.Conv2d(head_conv, classes, 
+                    kernel_size=1, stride=1, 
+                    padding=0, bias=True))
+                # Heads whose name contains 'hm' only get their final bias
+                # set to -2.19 (presumably a low initial sigmoid prior for
+                # heatmaps -- TODO confirm); all other heads are initialised
+                # by fill_fc_weights.
+                if 'hm' in head:
+                    fc[-1].bias.data.fill_(-2.19)
+                else:
+                    fill_fc_weights(fc)
+            else:
+                # Single 1x1 conv head.
+                fc = nn.Conv2d(channels[self.first_level], classes, 
+                  kernel_size=1, stride=1, 
+                  padding=0, bias=True)
+                if 'hm' in head:
+                    fc.bias.data.fill_(-2.19)
+                else:
+                    fill_fc_weights(fc)
+            self.__setattr__(head, fc)
+
+        '''
+        up_factor = 2 ** self.first_level
+        if up_factor > 1:
+            up = nn.ConvTranspose2d(classes, classes, up_factor * 2,
+                                    stride=up_factor, padding=up_factor // 2,
+                                    output_padding=0, groups=classes,
+                                    bias=False)
+            fill_up_weights(up)
+            up.weight.requires_grad = False
+        else:
+            up = Identity()
+        self.up = up
+        self.softmax = nn.LogSoftmax(dim=1)
+        
+
+        for m in self.fc.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+        '''
+
+    def forward(self, x):
+        """Return ``[ {head_name: head_output, ...} ]`` for input ``x``."""
+        x = self.base(x)
+        # Decode only the backbone levels from first_level onwards.
+        x = self.dla_up(x[self.first_level:])
+        # x = self.fc(x)
+        # y = self.softmax(self.up(x))
+        ret = {}
+        for head in self.heads:
+            ret[head] = self.__getattr__(head)(x)
+        return [ret]
+
+    '''
+    def optim_parameters(self, memo=None):
+        for param in self.base.parameters():
+            yield param
+        for param in self.dla_up.parameters():
+            yield param
+        for param in self.fc.parameters():
+            yield param
+    '''
+'''
+def dla34up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla34', classes, pretrained_base=pretrained_base, **kwargs)
+    return model
+
+
+def dla60up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla60', classes, pretrained_base=pretrained_base, **kwargs)
+    return model
+
+
+def dla102up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla102', classes,
+                   pretrained_base=pretrained_base, **kwargs)
+    return model
+
+
+def dla169up(classes, pretrained_base=None, **kwargs):
+    model = DLASeg('dla169', classes,
+                   pretrained_base=pretrained_base, **kwargs)
+    return model
+'''
+
+def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4):
+  model = DLASeg('dla{}'.format(num_layers), heads,
+                 pretrained=True,
+                 down_ratio=down_ratio,
+                 head_conv=head_conv)
+  return model
diff --git a/src/lib/models/networks/large_hourglass.py b/src/lib/models/networks/large_hourglass.py
new file mode 100644
index 0000000..b40ba72
--- /dev/null
+++ b/src/lib/models/networks/large_hourglass.py
@@ -0,0 +1,300 @@
+# ------------------------------------------------------------------------------
+# This code is based on
+# CornerNet (https://github.com/princeton-vl/CornerNet)
+# Copyright (c) 2018, University of Michigan
+# Licensed under the BSD 3-Clause License
+# ------------------------------------------------------------------------------
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+class convolution(nn.Module):
+    """k x k Conv2d followed by optional BatchNorm2d and an in-place ReLU."""
+
+    def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True):
+        super(convolution, self).__init__()
+        half = (k - 1) // 2
+        # The conv only carries its own bias when no batch norm follows it.
+        self.conv = nn.Conv2d(inp_dim, out_dim, (k, k), padding=(half, half),
+                              stride=(stride, stride), bias=not with_bn)
+        self.bn   = nn.BatchNorm2d(out_dim) if with_bn else nn.Sequential()
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        return self.relu(self.bn(self.conv(x)))
+
+class fully_connected(nn.Module):
+    """Linear layer, then BatchNorm1d when with_bn is set, then in-place ReLU."""
+
+    def __init__(self, inp_dim, out_dim, with_bn=True):
+        super(fully_connected, self).__init__()
+        self.with_bn = with_bn
+        self.linear = nn.Linear(inp_dim, out_dim)
+        if with_bn:  # self.bn exists only in the batch-norm configuration
+            self.bn = nn.BatchNorm1d(out_dim)
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        out = self.linear(x)
+        out = self.bn(out) if self.with_bn else out
+        return self.relu(out)
+
+class residual(nn.Module):
+    """Two 3x3 conv + BN stages added to a skip path, followed by ReLU.
+
+    `k` and `with_bn` are accepted but never read: the kernels are always
+    3x3 and batch norm is always used.
+    """
+
+    def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True):
+        super(residual, self).__init__()
+        needs_projection = stride != 1 or inp_dim != out_dim
+        self.conv1 = nn.Conv2d(inp_dim, out_dim, (3, 3), padding=(1, 1), stride=(stride, stride), bias=False)
+        self.bn1   = nn.BatchNorm2d(out_dim)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(out_dim, out_dim, (3, 3), padding=(1, 1), bias=False)
+        self.bn2   = nn.BatchNorm2d(out_dim)
+        # 1x1 conv + BN when stride or width changes, otherwise an empty Sequential.
+        if needs_projection:
+            self.skip = nn.Sequential(
+                nn.Conv2d(inp_dim, out_dim, (1, 1), stride=(stride, stride), bias=False),
+                nn.BatchNorm2d(out_dim))
+        else:
+            self.skip = nn.Sequential()
+        self.relu  = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        main = self.bn2(self.conv2(self.relu1(self.bn1(self.conv1(x)))))
+        return self.relu(main + self.skip(x))
+
+def make_layer(k, inp_dim, out_dim, modules, layer=convolution, **kwargs):
+    """Stack `modules` layers (at least one); only the first maps inp_dim -> out_dim."""
+    head = [layer(k, inp_dim, out_dim, **kwargs)]
+    tail = [layer(k, out_dim, out_dim, **kwargs) for _ in range(1, modules)]
+    return nn.Sequential(*(head + tail))
+
+def make_layer_revr(k, inp_dim, out_dim, modules, layer=convolution, **kwargs):
+    """Stack `modules - 1` inp_dim -> inp_dim layers followed by one inp_dim -> out_dim layer."""
+    stack = [layer(k, inp_dim, inp_dim, **kwargs) for _ in range(modules - 1)]
+    # The channel change happens in the last layer.
+    stack.append(layer(k, inp_dim, out_dim, **kwargs))
+    return nn.Sequential(*stack)
+
+class MergeUp(nn.Module):  # parameter-free module; forward returns up1 + up2
+    def forward(self, up1, up2):
+        return up1 + up2
+
+def make_merge_layer(dim):  # `dim` is ignored; each call returns a new MergeUp()
+    return MergeUp()
+
+# def make_pool_layer(dim):
+#     return nn.MaxPool2d(kernel_size=2, stride=2)
+
+def make_pool_layer(dim):  # `dim` is ignored; returns an empty nn.Sequential
+    return nn.Sequential()
+
+def make_unpool_layer(dim):  # `dim` is ignored; returns nn.Upsample with scale_factor=2
+    return nn.Upsample(scale_factor=2)
+
+def make_kp_layer(cnv_dim, curr_dim, out_dim):
+    """Head: 3x3 convolution block without batch norm, then a 1x1 Conv2d to out_dim channels."""
+    trunk = convolution(3, cnv_dim, curr_dim, with_bn=False)
+    projection = nn.Conv2d(curr_dim, out_dim, (1, 1))
+    return nn.Sequential(trunk, projection)
+
+def make_inter_layer(dim):  # residual block mapping dim -> dim channels
+    return residual(3, dim, dim)
+
+def make_cnv_layer(inp_dim, out_dim):  # 3x3 convolution block, inp_dim -> out_dim channels
+    return convolution(3, inp_dim, out_dim)
+
+class kp_module(nn.Module):
+    """One level of the recursive hourglass; levels n - 1 .. 1 are nested in `low2`.
+
+    `dims` / `modules` give channels / layer counts per level, outermost
+    first.  The `make_*` factories build the individual stages and are
+    passed on unchanged to the nested levels.
+    """
+
+    def __init__(
+        self, n, dims, modules, layer=residual,
+        make_up_layer=make_layer, make_low_layer=make_layer,
+        make_hg_layer=make_layer, make_hg_layer_revr=make_layer_revr,
+        make_pool_layer=make_pool_layer, make_unpool_layer=make_unpool_layer,
+        make_merge_layer=make_merge_layer, **kwargs
+    ):
+        super(kp_module, self).__init__()
+        self.n = n
+
+        curr_mod, next_mod = modules[0], modules[1]
+        curr_dim, next_dim = dims[0], dims[1]
+
+        # Same-resolution branch.
+        self.up1 = make_up_layer(
+            3, curr_dim, curr_dim, curr_mod, layer=layer, **kwargs)
+        # Nested branch: pool -> low1 -> low2 -> low3 -> unpool.
+        self.max1 = make_pool_layer(curr_dim)
+        self.low1 = make_hg_layer(
+            3, curr_dim, next_dim, curr_mod, layer=layer, **kwargs)
+        if self.n > 1:
+            # Recurse on the remaining levels with the same factories.
+            self.low2 = kp_module(
+                n - 1, dims[1:], modules[1:], layer=layer,
+                make_up_layer=make_up_layer,
+                make_low_layer=make_low_layer,
+                make_hg_layer=make_hg_layer,
+                make_hg_layer_revr=make_hg_layer_revr,
+                make_pool_layer=make_pool_layer,
+                make_unpool_layer=make_unpool_layer,
+                make_merge_layer=make_merge_layer,
+                **kwargs
+            )
+        else:
+            # Innermost level: a plain stack at next_dim channels.
+            self.low2 = make_low_layer(
+                3, next_dim, next_dim, next_mod, layer=layer, **kwargs)
+        self.low3 = make_hg_layer_revr(
+            3, next_dim, curr_dim, curr_mod, layer=layer, **kwargs)
+        self.up2 = make_unpool_layer(curr_dim)
+        self.merge = make_merge_layer(curr_dim)
+
+    def forward(self, x):
+        """Merge up1(x) with x passed through max1, low1, low2, low3 and up2."""
+        skip = self.up1(x)
+        # up1 is evaluated before the nested branch.
+        deep = self.max1(x)
+        for stage in (self.low1, self.low2, self.low3, self.up2):
+            deep = stage(deep)
+        return self.merge(skip, deep)
+
+class exkp(nn.Module):
+    """Stack of `nstack` hourglass modules with per-stack output heads.
+
+    `heads` maps head name -> number of output channels.  For every head an
+    nn.ModuleList with one branch per stack is registered under the head's
+    name; heads whose name contains 'hm' are built with `make_heat_layer` and
+    get their last layer's bias filled with -2.19, all others use
+    `make_regr_layer`.  `make_tl_layer`, `make_br_layer` and `make_tag_layer`
+    are accepted but never read by this constructor.
+    """
+
+    def __init__(
+        self, n, nstack, dims, modules, heads, pre=None, cnv_dim=256,
+        make_tl_layer=None, make_br_layer=None,
+        make_cnv_layer=make_cnv_layer, make_heat_layer=make_kp_layer,
+        make_tag_layer=make_kp_layer, make_regr_layer=make_kp_layer,
+        make_up_layer=make_layer, make_low_layer=make_layer,
+        make_hg_layer=make_layer, make_hg_layer_revr=make_layer_revr,
+        make_pool_layer=make_pool_layer, make_unpool_layer=make_unpool_layer,
+        make_merge_layer=make_merge_layer, make_inter_layer=make_inter_layer,
+        kp_layer=residual
+    ):
+        super(exkp, self).__init__()
+        self.nstack = nstack
+        self.heads = heads
+        curr_dim = dims[0]
+        between_stacks = nstack - 1
+
+        # Default stem: 7x7 stride-2 convolution, then a stride-2 residual block.
+        if pre is None:
+            pre = nn.Sequential(
+                convolution(7, 3, 128, stride=2),
+                residual(3, 128, 256, stride=2)
+            )
+        self.pre = pre
+
+        self.kps = nn.ModuleList([
+            kp_module(
+                n, dims, modules, layer=kp_layer,
+                make_up_layer=make_up_layer,
+                make_low_layer=make_low_layer,
+                make_hg_layer=make_hg_layer,
+                make_hg_layer_revr=make_hg_layer_revr,
+                make_pool_layer=make_pool_layer,
+                make_unpool_layer=make_unpool_layer,
+                make_merge_layer=make_merge_layer
+            ) for _ in range(nstack)
+        ])
+        self.cnvs = nn.ModuleList([
+            make_cnv_layer(curr_dim, cnv_dim) for _ in range(nstack)
+        ])
+
+        # Modules used between consecutive stacks (one fewer than stacks).
+        self.inters = nn.ModuleList([
+            make_inter_layer(curr_dim) for _ in range(between_stacks)
+        ])
+        self.inters_ = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(curr_dim, curr_dim, (1, 1), bias=False),
+                nn.BatchNorm2d(curr_dim)
+            ) for _ in range(between_stacks)
+        ])
+        self.cnvs_ = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(cnv_dim, curr_dim, (1, 1), bias=False),
+                nn.BatchNorm2d(curr_dim)
+            ) for _ in range(between_stacks)
+        ])
+
+        # Output heads: one ModuleList (a branch per stack) per head name.
+        for head in heads.keys():
+            is_heatmap = 'hm' in head
+            make_head = make_heat_layer if is_heatmap else make_regr_layer
+            branches = nn.ModuleList([
+                make_head(cnv_dim, curr_dim, heads[head]) for _ in range(nstack)
+            ])
+            self.__setattr__(head, branches)
+            if is_heatmap:
+                for heat in self.__getattr__(head):
+                    heat[-1].bias.data.fill_(-2.19)
+
+        self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, image):
+        """Return a list with one {head name: output} dict per stack."""
+        inter = self.pre(image)
+        outs = []
+
+        for ind in range(self.nstack):
+            cnv = self.cnvs[ind](self.kps[ind](inter))
+            outs.append({
+                head: self.__getattr__(head)[ind](cnv) for head in self.heads
+            })
+
+            if ind < self.nstack - 1:
+                # Fuse this stack's input and output to feed the next stack.
+                fused = self.inters_[ind](inter) + self.cnvs_[ind](cnv)
+                inter = self.inters[ind](self.relu(fused))
+        return outs
+
+
+def make_hg_layer(kernel, dim0, dim1, mod, layer=convolution, **kwargs):
+    """Stride-2 layer dim0 -> dim1, then `mod - 1` dim1 -> dim1 layers; **kwargs is not forwarded."""
+    first = layer(kernel, dim0, dim1, stride=2)
+    return nn.Sequential(first, *[layer(kernel, dim1, dim1) for _ in range(mod - 1)])
+
+
+class HourglassNet(exkp):
+    """exkp preset: 5 levels, fixed widths/depths, stride-2 make_hg_layer stages."""
+
+    def __init__(self, heads, num_stacks=2):
+        super(HourglassNet, self).__init__(
+            5, num_stacks,
+            [256, 256, 384, 384, 384, 512],  # dims: channels per level
+            [2, 2, 2, 2, 2, 4],              # modules: layers per level
+            heads,
+            make_tl_layer=None, make_br_layer=None,
+            make_pool_layer=make_pool_layer,
+            make_hg_layer=make_hg_layer,
+            kp_layer=residual, cnv_dim=256
+        )
+
+def get_large_hourglass_net(num_layers, heads, head_conv):
+  # num_layers and head_conv are accepted but unused; the net always gets 2 stacks.
+  return HourglassNet(heads, 2)
diff --git a/src/lib/models/networks/msra_resnet.py b/src/lib/models/networks/msra_resnet.py
new file mode 100644
index 0000000..0c59dfc
--- /dev/null
+++ b/src/lib/models/networks/msra_resnet.py
@@ -0,0 +1,280 @@
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (Bin.Xiao@microsoft.com)
+# Modified by Xingyi Zhou
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.utils.model_zoo as model_zoo
+
+BN_MOMENTUM = 0.1  # momentum argument given to the nn.BatchNorm2d layers in this file
+
+model_urls = {  # checkpoint URLs keyed by 'resnet<depth>'; read by PoseResNet.init_weights
+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Return a bias-free 3x3 nn.Conv2d with padding 1 and the given stride."""
+    options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **options)
+
+
+class BasicBlock(nn.Module):
+    """Residual block made of two 3x3 conv + BN stages.
+
+    `downsample`, when given, is applied to the input to produce the shortcut.
+    """
+
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Return relu(main_path(x) + shortcut)."""
+        out = self.conv1(x)
+        out = self.relu(self.bn1(out))
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        # The shortcut is derived from the block input after the main path has run.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out += shortcut
+        return self.relu(out)
+
+
+class Bottleneck(nn.Module):
+    """Residual bottleneck: 1x1 reduce, 3x3 (carries the stride), 1x1 expand.
+
+    The block outputs `planes * expansion` channels; `downsample`, when given,
+    is applied to the input to produce the shortcut.
+    """
+
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        width_out = planes * self.expansion
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        # Only conv2 carries the stride.
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(planes, width_out, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(width_out, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Return relu(main_path(x) + shortcut)."""
+        # 1x1 reduce
+        out = self.relu(self.bn1(self.conv1(x)))
+        # 3x3
+        out = self.relu(self.bn2(self.conv2(out)))
+        # 1x1 expand; no ReLU before the residual add
+        out = self.bn3(self.conv3(out))
+
+        # The shortcut is derived from the block input after the main path has run.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        # In-place add, then the final activation.
+        out += shortcut
+        return self.relu(out)
+
+
+class PoseResNet(nn.Module):
+    """ResNet trunk, three stride-2 deconvolution stages and one conv head per entry of `heads`.
+
+    Args:
+        block: residual block class exposing an `expansion` attribute.
+        layers: number of blocks in each of the four residual stages.
+        heads: mapping head name -> number of output channels.
+        head_conv: width of the 3x3 conv placed in front of each head's 1x1
+            output conv; when it is not positive the head is the 1x1 conv alone.
+    """
+
+    def __init__(self, block, layers, heads, head_conv, **kwargs):
+        super(PoseResNet, self).__init__()
+        self.inplanes = 64
+        self.deconv_with_bias = False
+        self.heads = heads
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+
+        # used for deconv layers
+        self.deconv_layers = self._make_deconv_layer(
+            3,
+            [256, 256, 256],
+            [4, 4, 4],
+        )
+
+        # Heads are created in sorted-name order.
+        for head in sorted(self.heads):
+            num_output = self.heads[head]
+            if head_conv > 0:
+                fc = nn.Sequential(
+                    nn.Conv2d(256, head_conv,
+                              kernel_size=3, padding=1, bias=True),
+                    nn.ReLU(inplace=True),
+                    nn.Conv2d(head_conv, num_output,
+                              kernel_size=1, stride=1, padding=0))
+            else:
+                fc = nn.Conv2d(
+                    in_channels=256,
+                    out_channels=num_output,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0
+                )
+            self.__setattr__(head, fc)
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        """Build one residual stage of `blocks` blocks and update self.inplanes."""
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM),
+            )
+
+        layers = [block(self.inplanes, planes, stride, downsample)]
+        self.inplanes = planes * block.expansion
+        layers.extend(block(self.inplanes, planes) for _ in range(1, blocks))
+        return nn.Sequential(*layers)
+
+    def _get_deconv_cfg(self, deconv_kernel, index):
+        """Return (kernel, padding, output_padding) for a deconvolution kernel size.
+
+        Raises:
+            ValueError: if `deconv_kernel` is not 2, 3 or 4.
+        """
+        if deconv_kernel == 4:
+            padding, output_padding = 1, 0
+        elif deconv_kernel == 3:
+            padding, output_padding = 1, 1
+        elif deconv_kernel == 2:
+            padding, output_padding = 0, 0
+        else:
+            raise ValueError(
+                'unsupported deconv kernel size: {}'.format(deconv_kernel))
+        return deconv_kernel, padding, output_padding
+
+    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+        """Build `num_layers` stride-2 ConvTranspose2d + BN + ReLU stages."""
+        assert num_layers == len(num_filters), \
+            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
+        assert num_layers == len(num_kernels), \
+            'ERROR: num_deconv_layers is different len(num_deconv_kernels)'
+
+        layers = []
+        for i in range(num_layers):
+            kernel, padding, output_padding = \
+                self._get_deconv_cfg(num_kernels[i], i)
+
+            planes = num_filters[i]
+            layers.append(
+                nn.ConvTranspose2d(
+                    in_channels=self.inplanes,
+                    out_channels=planes,
+                    kernel_size=kernel,
+                    stride=2,
+                    padding=padding,
+                    output_padding=output_padding,
+                    bias=self.deconv_with_bias))
+            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
+            layers.append(nn.ReLU(inplace=True))
+            self.inplanes = planes
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Return a one-element list holding a dict head name -> head output."""
+        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            x = stage(x)
+        x = self.deconv_layers(x)
+        return [{head: self.__getattr__(head)(x) for head in self.heads}]
+
+    def init_weights(self, num_layers, pretrained=True):
+        """Initialise deconv and head layers, then load the resnet<num_layers> checkpoint.
+
+        Raises:
+            ValueError: if `pretrained` is false.
+        """
+        if not pretrained:
+            print('=> imagenet pretrained model does not exist')
+            print('=> please download it first')
+            raise ValueError('imagenet pretrained model does not exist')
+
+        for m in self.deconv_layers.modules():
+            if isinstance(m, nn.ConvTranspose2d):
+                nn.init.normal_(m.weight, std=0.001)
+                if self.deconv_with_bias:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+        for head in self.heads:
+            for m in self.__getattr__(head).modules():
+                # Only Conv2d layers whose out_channels equals the head size are touched.
+                if not (isinstance(m, nn.Conv2d)
+                        and m.weight.shape[0] == self.heads[head]):
+                    continue
+                if 'hm' in head:
+                    nn.init.constant_(m.bias, -2.19)
+                else:
+                    nn.init.normal_(m.weight, std=0.001)
+                    nn.init.constant_(m.bias, 0)
+
+        url = model_urls['resnet{}'.format(num_layers)]
+        pretrained_state_dict = model_zoo.load_url(url)
+        print('=> loading pretrained model {}'.format(url))
+        # strict=False tolerates keys that differ between checkpoint and model.
+        self.load_state_dict(pretrained_state_dict, strict=False)
+
+
+resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]),  # depth -> (block class, blocks per stage)
+               34: (BasicBlock, [3, 4, 6, 3]),
+               50: (Bottleneck, [3, 4, 6, 3]),
+               101: (Bottleneck, [3, 4, 23, 3]),
+               152: (Bottleneck, [3, 8, 36, 3])}
+
+
+def get_pose_net(num_layers, heads, head_conv):
+  """Build the PoseResNet listed in resnet_spec for `num_layers` and initialise its weights."""
+  block_class, stage_sizes = resnet_spec[num_layers]
+  net = PoseResNet(block_class, stage_sizes, heads, head_conv=head_conv)
+  net.init_weights(num_layers, pretrained=True)
+  return net
diff --git a/src/lib/models/networks/pose_dla_dcn.py b/src/lib/models/networks/pose_dla_dcn.py
new file mode 100644
index 0000000..7cb6869
--- /dev/null
+++ b/src/lib/models/networks/pose_dla_dcn.py
@@ -0,0 +1,493 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import logging
+import numpy as np
+from os.path import join
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+import torch.utils.model_zoo as model_zoo
+
+from .DCNv2.dcn_v2 import DCN
+
+BN_MOMENTUM = 0.1  # momentum argument given to the nn.BatchNorm2d layers below
+logger = logging.getLogger(__name__)
+
+def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):  # -> '<base>/<data>/<name>-<hash>.pth'
+    return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))  # NOTE(review): `join` is os.path.join, so the separator is OS-dependent
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    "Return a bias-free 3x3 nn.Conv2d with padding 1 and the given stride."
+    options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **options)
+
+
+class BasicBlock(nn.Module):
+    """Two 3x3 conv + BN stages with a residual add; padding equals `dilation`.
+
+    forward() adds `residual` when one is passed, otherwise the block input.
+    """
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BasicBlock, self).__init__()
+        # Only the first conv carries the stride.
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
+                               stride=stride, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+                               stride=1, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        shortcut = x if residual is None else residual
+
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        # Residual add, then the final activation.
+        out += shortcut
+        return self.relu(out)
+
+
+class Bottleneck(nn.Module):
+    """1x1 -> 3x3 -> 1x1 residual block whose inner width is planes // expansion.
+
+    forward() adds `residual` when one is passed, otherwise the block input.
+    """
+
+    expansion = 2
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(Bottleneck, self).__init__()
+        bottle_planes = planes // Bottleneck.expansion
+        self.conv1 = nn.Conv2d(inplanes, bottle_planes,
+                               kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
+        # Stride and dilation are applied by the 3x3 conv only.
+        self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
+                               stride=stride, padding=dilation,
+                               bias=False, dilation=dilation)
+        self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(bottle_planes, planes,
+                               kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Return relu(main_path(x) + shortcut)."""
+        shortcut = x if residual is None else residual
+
+        # 1x1 reduce
+        out = self.relu(self.bn1(self.conv1(x)))
+        # 3x3
+        out = self.relu(self.bn2(self.conv2(out)))
+        # 1x1 expand; no ReLU before the residual add
+        out = self.bn3(self.conv3(out))
+
+        # In-place add, then the final activation.
+        out += shortcut
+        return self.relu(out)
+
+
+class BottleneckX(nn.Module):
+    """Grouped-convolution bottleneck: the 3x3 conv uses `cardinality` groups.
+
+    forward() adds `residual` when one is passed, otherwise the block input.
+    """
+
+    expansion = 2
+    cardinality = 32
+
+    def __init__(self, inplanes, planes, stride=1, dilation=1):
+        super(BottleneckX, self).__init__()
+        groups = BottleneckX.cardinality
+        # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
+        # bottle_planes = dim * cardinality
+        bottle_planes = planes * groups // 32
+        self.conv1 = nn.Conv2d(inplanes, bottle_planes,
+                               kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
+        self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
+                               stride=stride, padding=dilation, bias=False,
+                               dilation=dilation, groups=groups)
+        self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(bottle_planes, planes,
+                               kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.stride = stride
+
+    def forward(self, x, residual=None):
+        """Return relu(main_path(x) + shortcut)."""
+        shortcut = x if residual is None else residual
+
+        # 1x1 conv to the inner width
+        out = self.relu(self.bn1(self.conv1(x)))
+        # grouped 3x3 conv
+        out = self.relu(self.bn2(self.conv2(out)))
+        # 1x1 conv back to `planes`; no ReLU before the residual add
+        out = self.bn3(self.conv3(out))
+
+        # In-place add, then the final activation.
+        out += shortcut
+        return self.relu(out)
+
+
+class Root(nn.Module):
+    """Concatenate inputs on dim 1, then 1x1 conv -> BN -> optional add of x[0] -> ReLU.
+
+    `kernel_size` only sets the padding; the convolution kernel itself is always 1x1.
+    """
+    def __init__(self, in_channels, out_channels, kernel_size, residual):
+        super(Root, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels, out_channels, 1,
+            stride=1, bias=False, padding=(kernel_size - 1) // 2)
+        self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.residual = residual
+
+    def forward(self, *x):
+        fused = self.bn(self.conv(torch.cat(x, 1)))
+        if self.residual:
+            fused += x[0]
+        return self.relu(fused)
+
+
+class Tree(nn.Module):
+    """Recursive aggregation node: two `block`s plus a Root when levels == 1,
+    otherwise two Trees one level shallower.  Only levels == 1 nodes own a
+    Root; deeper nodes hand their collected children down through tree2.
+    """
+
+    def __init__(self, levels, block, in_channels, out_channels, stride=1,
+                 level_root=False, root_dim=0, root_kernel_size=1,
+                 dilation=1, root_residual=False):
+        super(Tree, self).__init__()
+        if root_dim == 0:
+            root_dim = 2 * out_channels
+        if level_root:
+            root_dim += in_channels
+        if levels == 1:
+            self.tree1 = block(in_channels, out_channels, stride,
+                               dilation=dilation)
+            self.tree2 = block(out_channels, out_channels, 1,
+                               dilation=dilation)
+            self.root = Root(root_dim, out_channels, root_kernel_size,
+                             root_residual)
+        else:
+            shared = dict(root_kernel_size=root_kernel_size,
+                          dilation=dilation, root_residual=root_residual)
+            self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
+                              stride, root_dim=0, **shared)
+            self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
+                              root_dim=root_dim + out_channels, **shared)
+
+        self.level_root = level_root
+        self.root_dim = root_dim
+        self.levels = levels
+        # Shortcut helpers: max-pool when strided, 1x1 conv + BN when channels change.
+        self.downsample = nn.MaxPool2d(stride, stride=stride) if stride > 1 else None
+        self.project = None
+        if in_channels != out_channels:
+            self.project = nn.Sequential(
+                nn.Conv2d(in_channels, out_channels,
+                          kernel_size=1, stride=1, bias=False),
+                nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
+            )
+
+    def forward(self, x, residual=None, children=None):
+        """Run the node; `residual` is recomputed from x, `children` (if given) is appended to."""
+        children = [] if children is None else children
+        bottom = self.downsample(x) if self.downsample else x
+        residual = self.project(bottom) if self.project else bottom
+        if self.level_root:
+            children.append(bottom)
+        x1 = self.tree1(x, residual)
+        if self.levels != 1:
+            children.append(x1)
+            return self.tree2(x1, children=children)
+        return self.root(self.tree2(x1), x1, *children)
+
+
+class DLA(nn.Module):
+    def __init__(self, levels, channels, num_classes=1000,
+                 block=BasicBlock, residual_root=False, linear_root=False):
+        super(DLA, self).__init__()
+        self.channels = channels
+        self.num_classes = num_classes
+        self.base_layer = nn.Sequential(
+            nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
+                      padding=3, bias=False),
+            nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
+            nn.ReLU(inplace=True))
+        self.level0 = self._make_conv_level(
+            channels[0], channels[0], levels[0])
+        self.level1 = self._make_conv_level(
+            channels[0], channels[1], levels[1], stride=2)
+        self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
+                           level_root=False,
+                           root_residual=residual_root)
+        self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
+                           level_root=True, root_residual=residual_root)
+        self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
+                           level_root=True, root_residual=residual_root)
+
+        # for m in self.modules():
+        #     if isinstance(m, nn.Conv2d):
+        #         n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+        #         m.weight.data.normal_(0, math.sqrt(2. / n))
+        #     elif isinstance(m, nn.BatchNorm2d):
+        #         m.weight.data.fill_(1)
+        #         m.bias.data.zero_()
+
+    def _make_level(self, block, inplanes, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or inplanes != planes:
+            downsample = nn.Sequential(
+                nn.MaxPool2d(stride, stride=stride),
+                nn.Conv2d(inplanes, planes,
+                          kernel_size=1, stride=1, bias=False),
+                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+            )
+
+        layers = []
+        layers.append(block(inplanes, planes, stride, downsample=downsample))
+        for i in range(1, blocks):
+            layers.append(block(inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
+        modules = []
+        for i in range(convs):
+            modules.extend([
+                nn.Conv2d(inplanes, planes, kernel_size=3,
+                          stride=stride if i == 0 else 1,
+                          padding=dilation, bias=False, dilation=dilation),
+                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+                nn.ReLU(inplace=True)])
+            inplanes = planes
+        return nn.Sequential(*modules)
+
+    def forward(self, x):
+        """Run base_layer, then level0..level5 in order; return the six level outputs."""
+        feat, outputs = self.base_layer(x), []
+        for idx in range(6):
+            feat = getattr(self, 'level{}'.format(idx))(feat)
+            outputs.append(feat)
+        return outputs
+
+    def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):
+        """Load pretrained weights from a local '.pth' file or from a model-zoo URL.
+
+        A 1x1 conv `self.fc`, sized from the checkpoint's last entry, is attached before loading.
+        """
+        if name.endswith('.pth'):
+            model_weights = torch.load(data + name)  # plain string concat, no separator added
+        else:
+            model_weights = model_zoo.load_url(get_model_url(data, name, hash))
+        last_key = list(model_weights.keys())[-1]
+        self.fc = nn.Conv2d(self.channels[-1], len(model_weights[last_key]),
+                            kernel_size=1, stride=1, padding=0, bias=True)
+        self.load_state_dict(model_weights)
+
+
+def dla34(pretrained=True, **kwargs):
+    """Build the DLA-34 backbone; when `pretrained`, load the 'dla34' imagenet weights."""
+    levels, channels = [1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]
+    net = DLA(levels, channels, block=BasicBlock, **kwargs)
+    if pretrained:
+        net.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86')
+    return net
+
+class Identity(nn.Module):
+    """Pass-through module: `forward` hands its input back unchanged.
+
+    It adds no parameters, so the inherited `nn.Module` constructor is used as is.
+    """
+    def forward(self, x):
+        return x
+
+
+def fill_fc_weights(layers):
+    """Zero the bias of every biased `nn.Conv2d` in `layers`; weights are untouched."""
+    for mod in layers.modules():
+        if isinstance(mod, nn.Conv2d) and mod.bias is not None:
+            nn.init.constant_(mod.bias, 0)
+
+
+def fill_up_weights(up):
+    """Overwrite `up.weight` in place: a separable tent kernel in [0, 0], copied to every [k, 0]."""
+    wt = up.weight.data
+    half = math.ceil(wt.size(2) / 2)
+    mid = (2 * half - 1 - half % 2) / (2. * half)
+    for row in range(wt.size(2)):
+        for col in range(wt.size(3)):
+            wt[0, 0, row, col] = ((1 - math.fabs(row / half - mid))
+                                  * (1 - math.fabs(col / half - mid)))
+    wt[1:, 0] = wt[0, 0]  # broadcast the [0, 0] kernel over the remaining leading-dim slots
+
+
+class DeformConv(nn.Module):
+    """A 3x3 `DCN` convolution (chi to cho channels) followed by BatchNorm and ReLU."""
+
+    def __init__(self, chi, cho):
+        super(DeformConv, self).__init__()
+        # Registration order (actf, then conv) is kept so state_dict key order is stable.
+        norm = nn.BatchNorm2d(cho, momentum=BN_MOMENTUM)
+        self.actf = nn.Sequential(norm, nn.ReLU(inplace=True))
+        self.conv = DCN(chi, cho, kernel_size=(3, 3), stride=1, padding=1,
+                        dilation=1, deformable_groups=1)
+
+    def forward(self, x):
+        return self.actf(self.conv(x))
+
+
+class IDAUp(nn.Module):
+    """Iterative aggregation: project and upsample each level, then fuse it with its neighbour.
+
+    For i of 1 and above the modules `proj_i`, `up_i` and `node_i` are registered;
+    entry 0 of `channels` and `up_f` is never read by the constructor.
+    """
+
+    def __init__(self, o, channels, up_f):
+        super(IDAUp, self).__init__()
+        for idx in range(1, len(channels)):
+            factor = int(up_f[idx])
+            # Build order (proj, node, up) is kept so random init draws in the same sequence.
+            proj = DeformConv(channels[idx], o)
+            node = DeformConv(o, o)
+            # Depthwise transposed conv with stride `factor`, filled by fill_up_weights.
+            up = nn.ConvTranspose2d(o, o, factor * 2, stride=factor, padding=factor // 2,
+                                    output_padding=0, groups=o, bias=False)
+            fill_up_weights(up)
+            for prefix, module in (('proj_', proj), ('up_', up), ('node_', node)):
+                setattr(self, prefix + str(idx), module)
+
+    def forward(self, layers, startp, endp):
+        """Update `layers[startp + 1:endp]` in place; nothing is returned."""
+        for pos in range(startp + 1, endp):
+            tag = str(pos - startp)
+            lifted = getattr(self, 'up_' + tag)(getattr(self, 'proj_' + tag)(layers[pos]))
+            layers[pos] = getattr(self, 'node_' + tag)(lifted + layers[pos - 1])
+
+
+
+class DLAUp(nn.Module):
+    """Chain of `IDAUp` stages (`ida_0`, `ida_1`, ...) built from the deepest level upward."""
+    def __init__(self, startp, channels, scales, in_channels=None):
+        super(DLAUp, self).__init__()
+        self.startp = startp
+        self.channels = channels
+        # Private copies: the loop rewrites these, which used to corrupt the caller's list.
+        in_channels = list(channels if in_channels is None else in_channels)
+        channels = list(channels)
+        scales = np.array(scales, dtype=int)
+        for i in range(len(channels) - 1):
+            j = -i - 2
+            ida = IDAUp(channels[j], in_channels[j:], scales[j:] // scales[j])
+            setattr(self, 'ida_{}'.format(i), ida)
+            scales[j + 1:] = scales[j]  # later levels now sit at level j's scale
+            in_channels[j + 1:] = [channels[j]] * len(channels[j + 1:])
+
+    def forward(self, layers):
+        """Run each stage on `layers` in place; collect `layers[-1]` after each, newest first."""
+        out = [layers[-1]]
+        for i in range(len(layers) - self.startp - 1):
+            getattr(self, 'ida_{}'.format(i))(layers, len(layers) - i - 2, len(layers))
+            out.insert(0, layers[-1])
+        return out
+
+
+class Interpolate(nn.Module):
+    """Module wrapper around `F.interpolate` with a fixed scale factor and mode."""
+    def __init__(self, scale, mode):
+        super(Interpolate, self).__init__()
+        self.scale, self.mode = scale, mode
+
+    def forward(self, x):
+        # align_corners is always passed as False.
+        return F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False)
+
+
+class DLASeg(nn.Module):
+    """DLA backbone + `DLAUp`/`IDAUp` upsampling + one convolutional output head per task.
+
+    Args:
+        base_name: name of a module-level backbone factory (looked up via `globals()`).
+        heads: mapping from head name to that head's number of output channels.
+        pretrained: forwarded to the backbone factory.
+        down_ratio: must be 2, 4, 8 or 16; its log2 selects the first backbone level used.
+        final_kernel: kernel size of each head's last convolution.
+        last_level: exclusive upper bound of the backbone levels that get fused.
+        head_conv: hidden width of each head; non-positive values give a single-conv head.
+        out_channel: width of the fused feature map; 0 uses the `first_level` backbone width.
+    """
+
+    def __init__(self, base_name, heads, pretrained, down_ratio, final_kernel,
+                 last_level, head_conv, out_channel=0):
+        super(DLASeg, self).__init__()
+        assert down_ratio in [2, 4, 8, 16]
+        self.first_level = int(np.log2(down_ratio))
+        self.last_level = last_level
+        self.base = globals()[base_name](pretrained=pretrained)
+        channels = self.base.channels
+        scales = [2 ** i for i in range(len(channels[self.first_level:]))]
+        self.dla_up = DLAUp(self.first_level, channels[self.first_level:], scales)
+
+        if out_channel == 0:
+            out_channel = channels[self.first_level]
+        self.ida_up = IDAUp(out_channel, channels[self.first_level:self.last_level],
+                            [2 ** i for i in range(self.last_level - self.first_level)])
+
+        self.heads = heads
+        for head in self.heads:
+            classes = self.heads[head]
+            # Heads read the `ida_up` output, so they take `out_channel` inputs; the old
+            # `channels[first_level]` only matched when `out_channel` was left at 0.
+            if head_conv > 0:
+                hidden = nn.Conv2d(out_channel, head_conv, kernel_size=3, padding=1, bias=True)
+                final = nn.Conv2d(head_conv, classes, kernel_size=final_kernel, stride=1,
+                                  padding=final_kernel // 2, bias=True)
+                fc = nn.Sequential(hidden, nn.ReLU(inplace=True), final)
+            else:
+                fc = final = nn.Conv2d(out_channel, classes, kernel_size=final_kernel,
+                                       stride=1, padding=final_kernel // 2, bias=True)
+            if 'hm' in head:
+                final.bias.data.fill_(-2.19)  # only the last conv's bias is set for 'hm' heads
+            else:
+                fill_fc_weights(fc)
+            setattr(self, head, fc)
+
+    def forward(self, x):
+        """Return a one-element list holding a dict {head name: head output}."""
+        x = self.dla_up(self.base(x))
+        # Clone the fused levels so `ida_up`'s in-place list updates work on copies.
+        y = [x[i].clone() for i in range(self.last_level - self.first_level)]
+        self.ida_up(y, 0, len(y))
+        return [{head: getattr(self, head)(y[-1]) for head in self.heads}]
+    
+
+def get_pose_net(num_layers, heads, head_conv=256, down_ratio=4):
+    """Build a pretrained `DLASeg` whose backbone factory is named 'dla' + num_layers.
+
+    The final head kernel is fixed to 1 and the last fused level to 5.
+    """
+    backbone = 'dla{}'.format(num_layers)
+    return DLASeg(backbone, heads, pretrained=True, down_ratio=down_ratio,
+                  final_kernel=1, last_level=5, head_conv=head_conv)
+
diff --git a/src/lib/models/networks/resnet_dcn.py b/src/lib/models/networks/resnet_dcn.py
new file mode 100644
index 0000000..805c2c3
--- /dev/null
+++ b/src/lib/models/networks/resnet_dcn.py
@@ -0,0 +1,290 @@
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (Bin.Xiao@microsoft.com)
+# Modified by Dequan Wang and Xingyi Zhou
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import math
+import logging
+
+import torch
+import torch.nn as nn
+from .DCNv2.dcn_v2 import DCN
+import torch.utils.model_zoo as model_zoo
+
+BN_MOMENTUM = 0.1  # momentum passed to every BatchNorm2d built in this module
+logger = logging.getLogger(__name__)
+
+model_urls = {  # pretrained ResNet checkpoint URLs keyed 'resnet' + depth; read by init_weights
+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Return a bias-free 3x3 `nn.Conv2d` with padding 1 and the given stride."""
+    options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    return nn.Conv2d(in_planes, out_planes, **options)
+
+
+class BasicBlock(nn.Module):
+    """Residual block: two 3x3 conv + BatchNorm layers, a shortcut add, and a final ReLU.
+
+    `downsample`, when given, is applied to the block input to build the shortcut;
+    otherwise the input itself is added. `expansion` is the ratio of output
+    channels to `planes` (1 for this block).
+    """
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        # Only the first convolution applies `stride`.
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        out = self.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+
+        # The shortcut is evaluated after the main branch.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out += shortcut  # in-place add
+        out = self.relu(out)
+        return out
+
+
+class Bottleneck(nn.Module):
+    """Residual block: 1x1 conv, 3x3 conv (strided), then 1x1 conv to `planes * expansion`.
+
+    Each convolution is followed by BatchNorm; ReLU follows the first two and
+    the final shortcut add. `downsample`, when given, builds the shortcut from
+    the block input; otherwise the input itself is added.
+    """
+
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        out_planes = planes * self.expansion
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        # Only the 3x3 convolution applies `stride`.
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
+        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(out_planes, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        # 1x1 conv to `planes`
+        out = self.relu(self.bn1(self.conv1(x)))
+        # 3x3 conv, carrying the stride
+        out = self.relu(self.bn2(self.conv2(out)))
+        # 1x1 conv to `planes * expansion`; no ReLU until after the shortcut add
+        out = self.bn3(self.conv3(out))
+
+        # The shortcut is evaluated after the main branch.
+        shortcut = x if self.downsample is None else self.downsample(x)
+
+        out += shortcut  # in-place add
+        out = self.relu(out)
+        return out
+
+def fill_up_weights(up):
+    """Overwrite `up.weight` in place: a separable tent kernel in [0, 0], copied to every [k, 0]."""
+    wt = up.weight.data
+    half = math.ceil(wt.size(2) / 2)
+    mid = (2 * half - 1 - half % 2) / (2. * half)
+    for row in range(wt.size(2)):
+        for col in range(wt.size(3)):
+            wt[0, 0, row, col] = ((1 - math.fabs(row / half - mid))
+                                  * (1 - math.fabs(col / half - mid)))
+    wt[1:, 0] = wt[0, 0]  # broadcast the [0, 0] kernel over the remaining leading-dim slots
+
+def fill_fc_weights(layers):
+    """Re-initialise every `nn.Conv2d` in `layers`: weights from N(0, 0.001**2), biases zero."""
+    for mod in layers.modules():
+        if not isinstance(mod, nn.Conv2d):
+            continue
+        nn.init.normal_(mod.weight, std=0.001)
+        if mod.bias is not None:
+            nn.init.constant_(mod.bias, 0)
+
+class PoseResNet(nn.Module):
+    """ResNet trunk, three DCN + transposed-conv upsampling stages, and per-task conv heads.
+
+    Args:
+        block: residual block class; it must expose an `expansion` attribute.
+        layers: number of blocks in each of the four residual stages.
+        heads: mapping from head name to that head's number of output channels.
+        head_conv: hidden width of each head; non-positive values give a single 1x1 conv.
+
+    Each head is registered as an attribute named after its key in `heads`.
+    """
+
+    def __init__(self, block, layers, heads, head_conv):
+        super(PoseResNet, self).__init__()  # initialise nn.Module before setting attributes
+        self.inplanes = 64
+        self.heads = heads
+        self.deconv_with_bias = False
+
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+
+        # Three upsampling stages ending at 64 channels, the input width of every head.
+        self.deconv_layers = self._make_deconv_layer(3, [256, 128, 64], [4, 4, 4])
+
+        for head in self.heads:
+            classes = self.heads[head]
+            if head_conv > 0:
+                hidden = nn.Conv2d(64, head_conv, kernel_size=3, padding=1, bias=True)
+                final = nn.Conv2d(head_conv, classes, kernel_size=1, stride=1,
+                                  padding=0, bias=True)
+                fc = nn.Sequential(hidden, nn.ReLU(inplace=True), final)
+            else:
+                fc = final = nn.Conv2d(64, classes, kernel_size=1, stride=1,
+                                       padding=0, bias=True)
+            if 'hm' in head:
+                # Heads whose name contains 'hm' only get a constant bias on the last conv.
+                final.bias.data.fill_(-2.19)
+            else:
+                fill_fc_weights(fc)
+            setattr(self, head, fc)
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        """Build one residual stage of `blocks` blocks and advance `self.inplanes`."""
+        out_planes = planes * block.expansion
+        downsample = None
+        if stride != 1 or self.inplanes != out_planes:
+            # 1x1 conv (carrying `stride`) + BN, handed to the first block as `downsample`.
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, out_planes,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(out_planes, momentum=BN_MOMENTUM),
+            )
+
+        layers = [block(self.inplanes, planes, stride, downsample)]
+        self.inplanes = out_planes
+        layers.extend(block(self.inplanes, planes) for _ in range(1, blocks))
+        return nn.Sequential(*layers)
+
+    def _get_deconv_cfg(self, deconv_kernel, index):
+        """Return `(kernel, padding, output_padding)` for one transposed convolution.
+
+        `index` is accepted for interface compatibility but is not used.
+
+        Raises:
+            ValueError: if `deconv_kernel` is not 2, 3 or 4. (This case used to
+                surface as an UnboundLocalError on the return statement.)
+        """
+        # kernel size to (padding, output_padding)
+        table = {4: (1, 0), 3: (1, 1), 2: (0, 0)}
+        if deconv_kernel not in table:
+            raise ValueError('unsupported deconv kernel size: {}'.format(deconv_kernel))
+        padding, output_padding = table[deconv_kernel]
+        return deconv_kernel, padding, output_padding
+
+    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+        """Build `num_layers` stages of DCN-BN-ReLU followed by ConvTranspose-BN-ReLU.
+
+        `num_filters[i]` and `num_kernels[i]` give stage i's channel width and
+        transposed-conv kernel size. `self.inplanes` ends at the last width.
+        """
+        assert num_layers == len(num_filters), \
+            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
+        # This message used to repeat the one above and so named the wrong argument.
+        assert num_layers == len(num_kernels), \
+            'ERROR: num_deconv_layers is different len(num_deconv_kernels)'
+
+        layers = []
+        for i in range(num_layers):
+            kernel, padding, output_padding = \
+                self._get_deconv_cfg(num_kernels[i], i)
+            planes = num_filters[i]
+
+            # The DCN is built before the transposed conv so that random
+            # initialisation draws in a fixed order.
+            fc = DCN(self.inplanes, planes, kernel_size=(3, 3), stride=1,
+                     padding=1, dilation=1, deformable_groups=1)
+            up = nn.ConvTranspose2d(
+                in_channels=planes,
+                out_channels=planes,
+                kernel_size=kernel,
+                stride=2,
+                padding=padding,
+                output_padding=output_padding,
+                bias=self.deconv_with_bias)
+            fill_up_weights(up)
+
+            layers.extend([
+                fc,
+                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+                nn.ReLU(inplace=True),
+                up,
+                nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
+                nn.ReLU(inplace=True),
+            ])
+            self.inplanes = planes
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Return a one-element list holding a dict {head name: head output}."""
+        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
+        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
+            x = stage(x)
+        x = self.deconv_layers(x)
+        return [{head: getattr(self, head)(x) for head in self.heads}]
+
+    def init_weights(self, num_layers):
+        """Load the pretrained ResNet checkpoint for `num_layers`, then reset deconv BatchNorms.
+
+        Loading is non-strict, so checkpoint and model keys need not match exactly.
+        """
+        url = model_urls['resnet{}'.format(num_layers)]
+        pretrained_state_dict = model_zoo.load_url(url)
+        print('=> loading pretrained model {}'.format(url))
+        self.load_state_dict(pretrained_state_dict, strict=False)
+        print('=> init deconv weights from normal distribution')
+        for name, m in self.deconv_layers.named_modules():
+            if isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+
+resnet_spec = {18: (BasicBlock, [2, 2, 2, 2]),  # depth: (block class, blocks per stage)
+               34: (BasicBlock, [3, 4, 6, 3]),
+               50: (Bottleneck, [3, 4, 6, 3]),
+               101: (Bottleneck, [3, 4, 23, 3]),
+               152: (Bottleneck, [3, 8, 36, 3])}
+
+
+def get_pose_net(num_layers, heads, head_conv=256):
+    """Build a `PoseResNet` of the given depth and load its pretrained backbone weights."""
+    block_class, stage_sizes = resnet_spec[num_layers]
+    net = PoseResNet(block_class, stage_sizes, heads, head_conv=head_conv)
+    net.init_weights(num_layers)
+    return net
diff --git a/src/lib/models/scatter_gather.py b/src/lib/models/scatter_gather.py
new file mode 100644
index 0000000..9a46058
--- /dev/null
+++ b/src/lib/models/scatter_gather.py
@@ -0,0 +1,38 @@
+import torch
+from torch.autograd import Variable
+from torch.nn.parallel._functions import Scatter, Gather
+
+
+def scatter(inputs, target_gpus, dim=0, chunk_sizes=None):
+    r"""Split `inputs` into one piece per entry of `target_gpus`.
+
+    `Variable`s are sliced along `dim` via `Scatter.apply`; tuples, lists and dicts are
+    scattered element-wise and regrouped per target; anything else is repeated by
+    reference. A non-`Variable` object for which `torch.is_tensor` holds is rejected.
+    """
+    def scatter_map(obj):
+        if isinstance(obj, Variable):
+            return Scatter.apply(target_gpus, chunk_sizes, dim, obj)
+        assert not torch.is_tensor(obj), "Tensors not supported in scatter."
+        if isinstance(obj, tuple):
+            return list(zip(*[scatter_map(item) for item in obj]))
+        if isinstance(obj, list):
+            return [list(group) for group in zip(*[scatter_map(item) for item in obj])]
+        if isinstance(obj, dict):
+            return [type(obj)(group) for group in zip(*[scatter_map(kv) for kv in obj.items()])]
+        return [obj for _ in target_gpus]
+
+    return scatter_map(inputs)
+
+
+def scatter_kwargs(inputs, kwargs, target_gpus, dim=0, chunk_sizes=None):
+    r"""Scatter positional and keyword arguments into two equal-length tuples,
+    padding the shorter side with empty tuples (inputs) or empty dicts (kwargs)."""
+    args_per_gpu = scatter(inputs, target_gpus, dim, chunk_sizes) if inputs else []
+    kwargs_per_gpu = scatter(kwargs, target_gpus, dim, chunk_sizes) if kwargs else []
+    missing = len(kwargs_per_gpu) - len(args_per_gpu)
+    if missing > 0:
+        args_per_gpu.extend(() for _ in range(missing))
+    elif missing != 0:
+        kwargs_per_gpu.extend({} for _ in range(-missing))
+    return tuple(args_per_gpu), tuple(kwargs_per_gpu)
diff --git a/src/lib/models/utils.py b/src/lib/models/utils.py
new file mode 100644
index 0000000..ee16add
--- /dev/null
+++ b/src/lib/models/utils.py
@@ -0,0 +1,56 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import torch.nn as nn
+
+
+def _sigmoid(x):
+    """Apply sigmoid to `x` IN PLACE, then return a copy clamped to [1e-4, 1 - 1e-4]."""
+    return torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4)
+
+
+def _gather_feat(feat, ind, mask=None):
+    """Gather rows `ind` along dim 1 of `feat`, giving (ind.size(0), ind.size(1), feat.size(2)).
+
+    With `mask`, only the masked rows are kept, flattened to (-1, feat.size(2))."""
+    dim = feat.size(2)
+    picked = feat.gather(1, ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim))
+    if mask is None:
+        return picked
+    return picked[mask.unsqueeze(2).expand_as(picked)].view(-1, dim)
+
+
+def _transpose_and_gather_feat(feat, ind):
+    """Move dim 1 of a 4-D `feat` last, merge dims 2 and 3 into one, then gather rows `ind`."""
+    channels_last = feat.permute(0, 2, 3, 1).contiguous()
+    flat = channels_last.view(channels_last.size(0), -1, channels_last.size(3))
+    return _gather_feat(flat, ind)
+
+
+def flip_tensor(x):
+    """Return `x` reversed along dim 3 (the input must therefore have at least 4 dims)."""
+    flipped = x.flip(3)
+    return flipped
+
+
+def flip_lr(x, flip_idx):
+    """Reverse `x` along its last axis and swap each pair of dim-1 indices in `flip_idx`."""
+    tmp = x.detach().cpu().numpy()[..., ::-1].copy()
+    for pair in flip_idx:
+        a, b = pair[0], pair[1]
+        tmp[:, [a, b]] = tmp[:, [b, a]]  # fancy indexing copies the right-hand side first
+    return torch.from_numpy(tmp).to(x.device)
+
+
+def flip_lr_off(x, flip_idx):
+    """Mirror a 4-D offset map: reverse the last axis, negate component 0 of each 2-channel pair
+    along dim 1, and swap the pairs listed in `flip_idx`; the pair count is inferred (was 17)."""
+    tmp = x.detach().cpu().numpy()[..., ::-1].copy()
+    shape = tmp.shape
+    # Group dim 1 into (pair, component); an odd channel count still raises ValueError.
+    tmp = tmp.reshape(shape[0], shape[1] // 2, 2, shape[2], shape[3])
+    tmp[:, :, 0, :, :] *= -1
+    for e in flip_idx:
+        tmp[:, [e[0], e[1]]] = tmp[:, [e[1], e[0]]]
+    return torch.from_numpy(tmp.reshape(shape)).to(x.device)
diff --git a/src/lib/opts.py b/src/lib/opts.py
new file mode 100755
index 0000000..4729a81
--- /dev/null
+++ b/src/lib/opts.py
@@ -0,0 +1,383 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+
+class opts(object):
+    def __init__(self):
+        self.parser = argparse.ArgumentParser()
+        # basic experiment setting
+        self.parser.add_argument('task', default='cetseg',
+                                 help='ctdet | ddd | multi_pose | exdet |cetseg')
+        self.parser.add_argument('--dataset', default='coco',
+                                 help='coco | kitti | coco_hp | pascal')
+        self.parser.add_argument('--exp_id', default='default')
+        self.parser.add_argument('--test', action='store_true')
+        self.parser.add_argument('--debug', type=int, default=0,
+                                 help='level of visualization.'
+                                      '1: only show the final detection results'
+                                      '2: show the network output features'
+                                      '3: use matplot to display'  # useful when lunching training with ipython notebook
+                                      '4: save all visualizations to disk')
+        self.parser.add_argument('--demo', default='',
+                                 help='path to image/ image folders/ video. '
+                                      'or "webcam"')
+        self.parser.add_argument('--load_model', default='',
+                                 help='path to pretrained model')
+        self.parser.add_argument('--resume', action='store_true',
+                                 help='resume an experiment. '
+                                      'Reloaded the optimizer parameter and '
+                                      'set load_model to model_last.pth '
+                                      'in the exp dir if load_model is empty.')
+
+        # system
+        self.parser.add_argument('--gpus', default='-1',
+                                 help='-1 for CPU, use comma for multiple gpus')
+        self.parser.add_argument('--num_workers', type=int, default=4,
+                                 help='dataloader threads. 0 for single-thread.')
+        self.parser.add_argument('--not_cuda_benchmark', action='store_true',
+                                 help='disable when the input size is not fixed.')
+        self.parser.add_argument('--seed', type=int, default=317,
+                                 help='random seed')  # from CornerNet
+
+        # log
+        self.parser.add_argument('--print_iter', type=int, default=0,
+                                 help='disable progress bar and print to screen.')
+        self.parser.add_argument('--hide_data_time', action='store_true',
+                                 help='not display time during training.')
+        self.parser.add_argument('--save_all', action='store_true',
+                                 help='save model to disk every 5 epochs.')
+        self.parser.add_argument('--metric', default='loss',
+                                 help='main metric to save best model')
+        self.parser.add_argument('--vis_thresh', type=float, default=0.3,
+                                 help='visualization threshold.')
+        self.parser.add_argument('--debugger_theme', default='white',
+                                 choices=['white', 'black'])
+
+        # model
+        self.parser.add_argument('--arch', default='dla_34',
+                                 help='model architecture. Currently tested'
+                                      'res_18 | res_101 | resdcn_18 | resdcn_101 |'
+                                      'dlav0_34 | dla_34 | hourglass')
+        self.parser.add_argument('--head_conv', type=int, default=-1,
+                                 help='conv layer channels for output head'
+                                      '0 for no conv layer'
+                                      '-1 for default setting: '
+                                      '64 for resnets and 256 for dla.')
+        self.parser.add_argument('--down_ratio', type=int, default=4,
+                                 help='output stride. Currently only supports 4.')
+
+        # input
+        self.parser.add_argument('--input_res', type=int, default=-1,
+                                 help='input height and width. -1 for default from '
+                                 'dataset. Will be overriden by input_h | input_w')
+        self.parser.add_argument('--input_h', type=int, default=-1,
+                                 help='input height. -1 for default from dataset.')
+        self.parser.add_argument('--input_w', type=int, default=-1,
+                                 help='input width. -1 for default from dataset.')
+
+        # train
+        self.parser.add_argument('--lr', type=float, default=1.25e-4,
+                                 help='learning rate for batch size 32.')
+        self.parser.add_argument('--lr_step', type=str, default='90,120',
+                                 help='drop learning rate by 10.')
+        self.parser.add_argument('--num_epochs', type=int, default=140,
+                                 help='total training epochs.')
+        self.parser.add_argument('--batch_size', type=int, default=32,
+                                 help='batch size')
+        self.parser.add_argument('--master_batch_size', type=int, default=-1,
+                                 help='batch size on the master gpu.')
+        self.parser.add_argument('--num_iters', type=int, default=-1,
+                                 help='default: #samples / batch_size.')
+        self.parser.add_argument('--val_intervals', type=int, default=5,
+                                 help='number of epochs to run validation.')
+        self.parser.add_argument('--trainval', action='store_true',
+                                 help='include validation in training and '
+                                      'test on test set')
+
+        # test
+        self.parser.add_argument('--flip_test', action='store_true',
+                                 help='flip data augmentation.')
+        self.parser.add_argument('--test_scales', type=str, default='1',
+                                 help='multi scale test augmentation.')
+        self.parser.add_argument('--nms', action='store_true',
+                                 help='run nms in testing.')
+        self.parser.add_argument('--K', type=int, default=100,
+                                 help='max number of output objects.')
+        self.parser.add_argument('--not_prefetch_test', action='store_true',
+                                 help='not use parallal data pre-processing.')
+        self.parser.add_argument('--fix_res', action='store_true',
+                                 help='fix testing resolution or keep '
+                                      'the original resolution')
+        self.parser.add_argument('--keep_res', action='store_true',
+                                 help='keep the original resolution'
+                                      ' during validation.')
+
+        # dataset
+        self.parser.add_argument('--not_rand_crop', action='store_true',
+                                 help='not use the random crop data augmentation'
+                                      'from CornerNet.')
+        self.parser.add_argument('--shift', type=float, default=0.1,
+                                 help='when not using random crop'
+                                      'apply shift augmentation.')
+        self.parser.add_argument('--scale', type=float, default=0.4,
+                                 help='when not using random crop'
+                                      'apply scale augmentation.')
+        self.parser.add_argument('--rotate', type=float, default=0,
+                                 help='when not using random crop'
+                                      'apply rotation augmentation.')
+        self.parser.add_argument('--flip', type=float, default=0.5,
+                                 help='probability of applying flip augmentation.')
+        self.parser.add_argument('--no_color_aug', action='store_true',
+                                 help='not use the color augmenation '
+                                      'from CornerNet')
+        # multi_pose
+        self.parser.add_argument('--aug_rot', type=float, default=0,
+                                 help='probability of applying '
+                                      'rotation augmentation.')
+        # ddd
+        self.parser.add_argument('--aug_ddd', type=float, default=0.5,
+                                 help='probability of applying crop augmentation.')
+        self.parser.add_argument('--rect_mask', action='store_true',
+                                 help='for ignored object, apply mask on the '
+                                      'rectangular region or just center point.')
+        self.parser.add_argument('--kitti_split', default='3dop',
+                                 help='different validation split for kitti: '
+                                      '3dop | subcnn')
+
+        # loss
+        self.parser.add_argument('--mse_loss', action='store_true',
+                                 help='use mse loss or focal loss to train '
+                                      'keypoint heatmaps.')
+        # ctdet
+        self.parser.add_argument('--reg_loss', default='l1',
+                                 help='regression loss: sl1 | l1 | l2')
+        self.parser.add_argument('--hm_weight', type=float, default=1,
+                                 help='loss weight for keypoint heatmaps.')
+        self.parser.add_argument('--off_weight', type=float, default=1,
+                                 help='loss weight for keypoint local offsets.')
+        self.parser.add_argument('--wh_weight', type=float, default=0.1,
+                                 help='loss weight for bounding box size.')
+        # multi_pose
+        self.parser.add_argument('--hp_weight', type=float, default=1,
+                                 help='loss weight for human pose offset.')
+        self.parser.add_argument('--hm_hp_weight', type=float, default=1,
+                                 help='loss weight for human keypoint heatmap.')
+        # ddd
+        self.parser.add_argument('--dep_weight', type=float, default=1,
+                                 help='loss weight for depth.')
+        self.parser.add_argument('--dim_weight', type=float, default=1,
+                                 help='loss weight for 3d bounding box size.')
+        self.parser.add_argument('--rot_weight', type=float, default=1,
+                                 help='loss weight for orientation.')
+        self.parser.add_argument('--peak_thresh', type=float, default=0.2)
+
+        # task
+        # ctdet
+        self.parser.add_argument('--norm_wh', action='store_true',
+                                 help='L1(\hat(y) / y, 1) or L1(\hat(y), y)')
+        self.parser.add_argument('--dense_wh', action='store_true',
+                                 help='apply weighted regression near center or '
+                                      'just apply regression on center point.')
+        self.parser.add_argument('--cat_spec_wh', action='store_true',
+                                 help='category specific bounding box size.')
+        self.parser.add_argument('--not_reg_offset', action='store_true',
+                                 help='not regress local offset.')
+        # ctseg
+        self.parser.add_argument('--seg_feat_channel', default=8, type=int,
+                                 help='.')
+        self.parser.add_argument('--seg_weight', default=1., type=float,
+                                 help='')
+        # exdet
+        self.parser.add_argument('--agnostic_ex', action='store_true',
+                                 help='use category agnostic extreme points.')
+        self.parser.add_argument('--scores_thresh', type=float, default=0.1,
+                                 help='threshold for extreme point heatmap.')
+        self.parser.add_argument('--center_thresh', type=float, default=0.1,
+                                 help='threshold for centermap.')
+        self.parser.add_argument('--aggr_weight', type=float, default=0.0,
+                                 help='edge aggregation weight.')
+        # multi_pose
+        self.parser.add_argument('--dense_hp', action='store_true',
+                                 help='apply weighted pose regression near center '
+                                      'or just apply regression on center point.')
+        self.parser.add_argument('--not_hm_hp', action='store_true',
+                                 help='not estimate human joint heatmap, '
+                                      'directly use the joint offset from center.')
+        self.parser.add_argument('--not_reg_hp_offset', action='store_true',
+                                 help='not regress local offset for '
+                                      'human joint heatmaps.')
+        self.parser.add_argument('--not_reg_bbox', action='store_true',
+                                 help='not regression bounding box size.')
+
+        # ground truth validation
+        self.parser.add_argument('--eval_oracle_hm', action='store_true',
+                                 help='use ground center heatmap.')
+        self.parser.add_argument('--eval_oracle_wh', action='store_true',
+                                 help='use ground truth bounding box size.')
+        self.parser.add_argument('--eval_oracle_offset', action='store_true',
+                                 help='use ground truth local heatmap offset.')
+        self.parser.add_argument('--eval_oracle_kps', action='store_true',
+                                 help='use ground truth human pose offset.')
+        self.parser.add_argument('--eval_oracle_hmhp', action='store_true',
+                                 help='use ground truth human joint heatmaps.')
+        self.parser.add_argument('--eval_oracle_hp_offset', action='store_true',
+                                 help='use ground truth human joint local offset.')
+        self.parser.add_argument('--eval_oracle_dep', action='store_true',
+                                 help='use ground truth depth.')
+
+    def parse(self, args=''):
+        """Parse options and fill in the values derived from them.
+
+        ``args`` goes to argparse; the default '' makes it read sys.argv.
+        """
+        parser = self.parser
+        opt = parser.parse_args() if args == '' else parser.parse_args(args)
+
+        # Keep the raw string; ids become 0..n-1, or [-1] if the first is < 0.
+        opt.gpus_str = opt.gpus
+        requested = [int(tok) for tok in opt.gpus.split(',')]
+        opt.gpus = list(range(len(requested))) if requested[0] >= 0 else [-1]
+        opt.lr_step = [int(tok) for tok in opt.lr_step.split(',')]
+        opt.test_scales = [float(tok) for tok in opt.test_scales.split(',')]
+
+        # Positive switches derived from the negated command-line flags.
+        opt.fix_res = not opt.keep_res
+        print('Fix size testing.' if opt.fix_res else 'Keep resolution testing.')
+        opt.reg_offset = not opt.not_reg_offset
+        opt.reg_bbox = not opt.not_reg_bbox
+        opt.hm_hp = not opt.not_hm_hp
+        opt.reg_hp_offset = opt.hm_hp and not opt.not_reg_hp_offset
+
+        if opt.head_conv == -1:  # -1 means "pick the default for this arch"
+            opt.head_conv = 256 if 'dla' in opt.arch else 64
+        opt.pad = 127 if 'hourglass' in opt.arch else 31
+        opt.num_stacks = 2 if opt.arch == 'hourglass' else 1
+
+        if opt.trainval:
+            opt.val_intervals = 100000000
+
+        if opt.debug > 0:  # debugging: one sample, one device, no workers
+            opt.num_workers, opt.batch_size = 0, 1
+            opt.gpus = opt.gpus[:1]
+            opt.master_batch_size = -1
+
+        # First device gets master_batch_size; the rest share the remainder.
+        n_gpus = len(opt.gpus)
+        if opt.master_batch_size == -1:
+            opt.master_batch_size = opt.batch_size // n_gpus
+        opt.chunk_sizes = [opt.master_batch_size]
+        if n_gpus > 1:
+            share, extra = divmod(opt.batch_size - opt.master_batch_size, n_gpus - 1)
+            opt.chunk_sizes += [share + (k < extra) for k in range(n_gpus - 1)]
+        print('training chunk_sizes:', opt.chunk_sizes)
+
+        opt.root_dir = os.path.join(os.path.dirname(__file__), '..', '..')
+        opt.data_dir = os.path.join(opt.root_dir, 'data')
+        opt.exp_dir = os.path.join(opt.root_dir, 'exp', opt.task)
+        opt.save_dir = os.path.join(opt.exp_dir, opt.exp_id)
+        opt.debug_dir = os.path.join(opt.save_dir, 'debug')
+        print('The output will be saved to ', opt.save_dir)
+
+        if opt.resume and opt.load_model == '':
+            trim = -4 if opt.save_dir.endswith('TEST') else None
+            opt.load_model = os.path.join(opt.save_dir[:trim], 'model_last.pth')
+        return opt
+
+    def update_dataset_info_and_set_heads(self, opt, dataset):
+        """Copy dataset statistics into ``opt`` and choose the output heads.
+
+        Sets mean/std/num_classes, resolves the input and output sizes, and
+        stores in ``opt.heads`` one integer per head name.  Returns ``opt``.
+
+        Raises:
+            AssertionError: if ``opt.task`` is not one of the known tasks.
+        """
+        opt.mean, opt.std = dataset.mean, dataset.std
+        opt.num_classes = dataset.num_classes
+
+        # input_h(w): opt.input_h overrides opt.input_res overrides dataset default
+        default_h, default_w = dataset.default_resolution
+        if opt.input_res > 0:
+            default_h = default_w = opt.input_res
+        if opt.input_h <= 0:
+            opt.input_h = default_h
+        if opt.input_w <= 0:
+            opt.input_w = default_w
+        opt.output_h = opt.input_h // opt.down_ratio
+        opt.output_w = opt.input_w // opt.down_ratio
+        opt.input_res = max(opt.input_h, opt.input_w)
+        opt.output_res = max(opt.output_h, opt.output_w)
+
+        if opt.task == 'exdet':
+            num_hm = 1 if opt.agnostic_ex else opt.num_classes
+            heads = {'hm_t': num_hm, 'hm_l': num_hm, 'hm_b': num_hm,
+                     'hm_r': num_hm, 'hm_c': opt.num_classes}
+            if opt.reg_offset:
+                heads.update({'reg_t': 2, 'reg_l': 2, 'reg_b': 2, 'reg_r': 2})
+        elif opt.task == 'ddd':
+            heads = {'hm': opt.num_classes, 'dep': 1, 'rot': 8, 'dim': 3}
+            if opt.reg_bbox:
+                heads['wh'] = 2
+            if opt.reg_offset:
+                heads['reg'] = 2
+        elif opt.task in ('ctdet', 'ctseg'):
+            heads = {'hm': opt.num_classes,
+                     'wh': 2 * opt.num_classes if opt.cat_spec_wh else 2}
+            if opt.task == 'ctseg':
+                ch = opt.seg_feat_channel
+                heads['conv_weight'] = 2 * ch ** 2 + 5 * ch + 1
+                heads['seg_feat'] = ch
+            if opt.reg_offset:
+                heads['reg'] = 2
+        elif opt.task == 'multi_pose':
+            opt.flip_idx = dataset.flip_idx
+            heads = {'hm': opt.num_classes, 'wh': 2, 'hps': 34}
+            if opt.reg_offset:
+                heads['reg'] = 2
+            if opt.hm_hp:
+                heads['hm_hp'] = 17
+            if opt.reg_hp_offset:
+                heads['hp_offset'] = 2
+        else:
+            # Explicit raise: a bare ``assert`` would vanish under ``python -O``.
+            raise AssertionError('task not defined!')
+        opt.heads = heads
+        print('heads', opt.heads)
+        return opt
+
+    def init(self, args=''):
+        """Parse ``args``, then set dataset info and heads from the per-task
+        defaults table below.  Returns the resulting ``opt``."""
+        def coco_like(**extra):
+            info = {'default_resolution': [512, 512], 'num_classes': 80,
+                    'mean': [0.408, 0.447, 0.470], 'std': [0.289, 0.274, 0.278],
+                    'dataset': 'coco'}
+            info.update(extra)
+            return info
+
+        default_dataset_info = {
+            'ctdet': coco_like(),
+            'ctseg': coco_like(),
+            'exdet': coco_like(),
+            'multi_pose': coco_like(
+                num_classes=1, dataset='coco_hp', num_joints=17,
+                flip_idx=[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
+                          [11, 12], [13, 14], [15, 16]]),
+            'ddd': {'default_resolution': [384, 1280], 'num_classes': 3,
+                    'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225],
+                    'dataset': 'kitti'},
+        }
+
+        class Struct:  # exposes the items of a dict as attributes
+            def __init__(self, entries):
+                self.__dict__.update(entries)
+
+        opt = self.parse(args)
+        dataset = Struct(default_dataset_info[opt.task])
+        opt.dataset = dataset.dataset
+        return self.update_dataset_info_and_set_heads(opt, dataset)
diff --git a/src/lib/trains/base_trainer.py b/src/lib/trains/base_trainer.py
new file mode 100755
index 0000000..96655cd
--- /dev/null
+++ b/src/lib/trains/base_trainer.py
@@ -0,0 +1,120 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+import torch
+from progress.bar import Bar
+from models.data_parallel import DataParallel
+from utils.utils import AverageMeter, collate
+
+
+class ModelWithLoss(torch.nn.Module):
+  """Bundle a network with its criterion so a single call yields the loss."""
+  def __init__(self, model, loss):
+    super(ModelWithLoss, self).__init__()
+    self.model, self.loss = model, loss
+
+  def forward(self, batch):
+    predictions = self.model(batch['input'])
+    total, stats = self.loss(predictions, batch)
+    return predictions[-1], total, stats  # last entry of the model output
+
+class BaseTrainer(object):
+  """Shared epoch loop; subclasses add _get_losses, debug and save_result."""
+  def __init__(self, opt, model, optimizer=None):
+    self.opt = opt
+    self.optimizer = optimizer
+    self.loss_stats, self.loss = self._get_losses(opt)
+    self.model_with_loss = ModelWithLoss(model, self.loss)
+
+  def set_device(self, gpus, chunk_sizes, device):
+    """Move the model (and optimizer state, if an optimizer was given)."""
+    if len(gpus) > 1:
+      self.model_with_loss = DataParallel(
+        self.model_with_loss, device_ids=gpus, chunk_sizes=chunk_sizes).to(device)
+    else:
+      self.model_with_loss = self.model_with_loss.to(device)
+    states = () if self.optimizer is None else self.optimizer.state.values()
+    for state in states:
+      for k, v in state.items():
+        if isinstance(v, torch.Tensor):
+          state[k] = v.to(device=device, non_blocking=True)
+
+  def run_epoch(self, phase, epoch, data_loader):
+    """Run one pass over ``data_loader``; weights change only for 'train'.
+    Returns (averaged loss stats plus minutes under 'time', results dict)."""
+    opt = self.opt
+    model_with_loss = self.model_with_loss
+    if phase == 'train':
+      model_with_loss.train()
+    else:
+      if len(opt.gpus) > 1:
+        model_with_loss = self.model_with_loss.module
+      model_with_loss.eval()
+      torch.cuda.empty_cache()
+
+    results = {}
+    data_time, batch_time = AverageMeter(), AverageMeter()
+    avg_loss_stats = {l: AverageMeter() for l in self.loss_stats}
+    num_iters = len(data_loader) if opt.num_iters < 0 else opt.num_iters
+    Bar.check_tty = False #Fixes Progress bar in colab
+    bar = Bar('{}/{}'.format(opt.task, opt.exp_id), max=num_iters)
+    end = time.time()
+    for iter_id, batch in enumerate(data_loader):
+      if iter_id >= num_iters:
+        break
+      data_time.update(time.time() - end)
+      for k in batch:
+        if k != 'meta':
+          batch[k] = batch[k].to(device=opt.device, non_blocking=True)
+      with torch.set_grad_enabled(phase == 'train'):  # no autograd graph in eval
+        output, loss, loss_stats = model_with_loss(batch)
+      loss = loss.mean()
+      if phase == 'train':
+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
+      batch_time.update(time.time() - end)
+      end = time.time()
+      Bar.suffix = '{phase}: [{0}][{1}/{2}]|Tot: {total:} |ETA: {eta:} '.format(
+        epoch, iter_id, num_iters, phase=phase,
+        total=bar.elapsed_td, eta=bar.eta_td)
+      for l in avg_loss_stats:
+        stat = loss_stats[l]
+        # A loss term that was never accumulated can still be a plain number.
+        value = stat.mean().item() if torch.is_tensor(stat) else float(stat)
+        avg_loss_stats[l].update(value, batch['input'].size(0))
+        Bar.suffix = Bar.suffix + '|{} {:.4f} '.format(l, avg_loss_stats[l].avg)
+      if not opt.hide_data_time:
+        Bar.suffix = Bar.suffix + '|Data {dt.val:.3f}s({dt.avg:.3f}s) ' \
+          '|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time)
+      if opt.print_iter > 0:
+        if iter_id % opt.print_iter == 0:
+          print('{}/{}| {}'.format(opt.task, opt.exp_id, Bar.suffix))
+      else:
+        bar.next()
+      if opt.debug > 0:
+        self.debug(batch, output, iter_id)
+      if opt.test:
+        self.save_result(output, batch, results)
+      del output, loss, loss_stats
+    bar.finish()
+    ret = {k: v.avg for k, v in avg_loss_stats.items()}
+    ret['time'] = bar.elapsed_td.total_seconds() / 60.
+    return ret, results
+
+  def debug(self, batch, output, iter_id):
+    raise NotImplementedError
+
+  def save_result(self, output, batch, results):
+    raise NotImplementedError
+
+  def _get_losses(self, opt):
+    raise NotImplementedError
+
+  def val(self, epoch, data_loader):
+    return self.run_epoch('val', epoch, data_loader)
+
+  def train(self, epoch, data_loader):
+    return self.run_epoch('train', epoch, data_loader)
\ No newline at end of file
diff --git a/src/lib/trains/ctdet.py b/src/lib/trains/ctdet.py
new file mode 100755
index 0000000..4debc0b
--- /dev/null
+++ b/src/lib/trains/ctdet.py
@@ -0,0 +1,132 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+
+from models.losses import FocalLoss
+from models.losses import RegL1Loss, RegLoss, NormRegL1Loss, RegWeightedL1Loss
+from models.decode import ctdet_decode
+from models.utils import _sigmoid
+from utils.debugger import Debugger
+from utils.post_process import ctdet_post_process
+from utils.oracle_utils import gen_oracle_map
+from .base_trainer import BaseTrainer
+
+class CtdetLoss(torch.nn.Module):
+  """Weighted sum of heatmap, box-size and offset losses over all stacks."""
+  def __init__(self, opt):
+    super(CtdetLoss, self).__init__()
+    self.crit = torch.nn.MSELoss() if opt.mse_loss else FocalLoss()
+    if opt.reg_loss == 'l1':
+      self.crit_reg = RegL1Loss()
+    elif opt.reg_loss == 'sl1':
+      self.crit_reg = RegLoss()
+    else:
+      self.crit_reg = None
+    if opt.dense_wh:
+      self.crit_wh = torch.nn.L1Loss(reduction='sum')
+    elif opt.norm_wh:
+      self.crit_wh = NormRegL1Loss()
+    else:
+      self.crit_wh = RegWeightedL1Loss() if opt.cat_spec_wh else self.crit_reg
+    self.opt = opt
+
+  def forward(self, outputs, batch):
+    """Return ``(loss, loss_stats)``; the stack outputs are modified in place."""
+    opt = self.opt
+    hm_loss = wh_loss = off_loss = 0
+    for s in range(opt.num_stacks):
+      output = outputs[s]
+      if not opt.mse_loss:
+        output['hm'] = _sigmoid(output['hm'])
+      if opt.eval_oracle_hm:
+        output['hm'] = batch['hm']
+      # Optionally swap the regressed maps for ones built from the batch targets.
+      for key, flag in (('wh', opt.eval_oracle_wh), ('reg', opt.eval_oracle_offset)):
+        if flag:
+          oracle = gen_oracle_map(batch[key].detach().cpu().numpy(),
+                                  batch['ind'].detach().cpu().numpy(),
+                                  output[key].shape[3], output[key].shape[2])
+          output[key] = torch.from_numpy(oracle).to(opt.device)
+      hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
+      if opt.wh_weight > 0:
+        if opt.dense_wh:
+          dense_mask = batch['dense_wh_mask']
+          masked_l1 = self.crit_wh(output['wh'] * dense_mask,
+                                   batch['dense_wh'] * dense_mask)
+          wh_loss += (masked_l1 / (dense_mask.sum() + 1e-4)) / opt.num_stacks
+        elif opt.cat_spec_wh:
+          wh_loss += self.crit_wh(output['wh'], batch['cat_spec_mask'],
+                                  batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
+        else:
+          # NOTE(review): crit_reg is used on this path, so a crit_wh chosen
+          # for opt.norm_wh is not applied here -- confirm this is intended.
+          wh_loss += self.crit_reg(output['wh'], batch['reg_mask'],
+                                   batch['ind'], batch['wh']) / opt.num_stacks
+      if opt.reg_offset and opt.off_weight > 0:
+        off_loss += self.crit_reg(output['reg'], batch['reg_mask'],
+                                  batch['ind'], batch['reg']) / opt.num_stacks
+    loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
+           opt.off_weight * off_loss
+    return loss, {'loss': loss, 'hm_loss': hm_loss,
+                  'wh_loss': wh_loss, 'off_loss': off_loss}
+
+class CtdetTrainer(BaseTrainer):
+  """Trainer for the 'ctdet' task: CtdetLoss plus decoding for debug/test."""
+  def __init__(self, opt, model, optimizer=None):
+    super(CtdetTrainer, self).__init__(opt, model, optimizer=optimizer)
+
+  def _get_losses(self, opt):
+    """Return the names of the logged loss terms and the criterion."""
+    return ['loss', 'hm_loss', 'wh_loss', 'off_loss'], CtdetLoss(opt)
+
+  def debug(self, batch, output, iter_id):
+    """Draw predicted and ground-truth heatmaps and boxes for the first image,
+    then save them (``opt.debug == 4``) or show them."""
+    opt = self.opt
+    dets = ctdet_decode(
+      output['hm'], output['wh'], reg=output['reg'] if opt.reg_offset else None,
+      cat_spec_wh=opt.cat_spec_wh, K=opt.K)
+    n_fields = dets.shape[2]
+    dets = dets.detach().cpu().numpy().reshape(1, -1, n_fields)
+    dets[:, :, :4] *= opt.down_ratio
+    dets_gt = batch['meta']['gt_det'].numpy().reshape(1, -1, n_fields)
+    dets_gt[:, :, :4] *= opt.down_ratio
+    for i in range(1):
+      debugger = Debugger(
+        dataset=opt.dataset, ipynb=(opt.debug == 3), theme=opt.debugger_theme)
+      img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+      # img * std + mean, scaled to 0..255 and clipped before the uint8 cast
+      img = np.clip((img * opt.std + opt.mean) * 255., 0, 255).astype(np.uint8)
+      pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
+      gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
+      debugger.add_blend_img(img, pred, 'pred_hm')
+      debugger.add_blend_img(img, gt, 'gt_hm')
+      # Same drawing routine for predictions and ground truth.
+      for img_id, boxes in (('out_pred', dets[i]), ('out_gt', dets_gt[i])):
+        debugger.add_img(img, img_id=img_id)
+        for box in boxes:
+          if box[4] > opt.center_thresh:
+            debugger.add_coco_bbox(box[:4], box[-1], box[4], img_id=img_id)
+
+      if opt.debug == 4:
+        debugger.save_all_imgs(opt.debug_dir, prefix='{}'.format(iter_id))
+      else:
+        debugger.show_all_imgs(pause=True)
+
+  def save_result(self, output, batch, results):
+    """Decode ``output`` and store the post-processed detections of the first
+    image in ``results`` under its image id."""
+    opt, meta = self.opt, batch['meta']
+    dets = ctdet_decode(
+      output['hm'], output['wh'], reg=output['reg'] if opt.reg_offset else None,
+      cat_spec_wh=opt.cat_spec_wh, K=opt.K)
+    dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+    hm_shape = output['hm'].shape
+    dets_out = ctdet_post_process(
+      dets.copy(), meta['c'].cpu().numpy(), meta['s'].cpu().numpy(),
+      hm_shape[2], hm_shape[3], hm_shape[1])
+    # only index 0 of the batch is recorded
+    results[meta['img_id'].cpu().numpy()[0]] = dets_out[0]
\ No newline at end of file
diff --git a/src/lib/trains/ctseg.py b/src/lib/trains/ctseg.py
new file mode 100644
index 0000000..d5721fc
--- /dev/null
+++ b/src/lib/trains/ctseg.py
@@ -0,0 +1,140 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+
+from models.losses import FocalLoss, DiceLoss
+from models.losses import RegL1Loss, RegLoss, NormRegL1Loss, RegWeightedL1Loss
+from models.decode import ctdet_decode
+from models.utils import _sigmoid
+from utils.debugger import Debugger
+from utils.post_process import ctdet_post_process
+from utils.oracle_utils import gen_oracle_map
+from .base_trainer import BaseTrainer
+
+
+class CtsegLoss(torch.nn.Module):
+    """Ctdet-style detection losses plus a mask loss for the 'ctseg' task."""
+    def __init__(self, opt):
+        super(CtsegLoss, self).__init__()
+        self.crit = torch.nn.MSELoss() if opt.mse_loss else FocalLoss()
+        self.crit_reg = RegL1Loss() if opt.reg_loss == 'l1' else \
+            RegLoss() if opt.reg_loss == 'sl1' else None
+        self.crit_wh = torch.nn.L1Loss(reduction='sum') if opt.dense_wh else \
+            NormRegL1Loss() if opt.norm_wh else \
+            RegWeightedL1Loss() if opt.cat_spec_wh else self.crit_reg
+        self.crit_mask = DiceLoss(opt.seg_feat_channel)
+        self.opt = opt
+
+    def forward(self, outputs, batch):
+        """Return ``(loss, loss_stats)``; each term is averaged over the stacks.
+
+        Stack outputs are modified in place (sigmoid / oracle replacement).
+        """
+        opt = self.opt
+        hm_loss, wh_loss, off_loss, mask_loss = 0, 0, 0, 0
+        for s in range(opt.num_stacks):
+            output = outputs[s]
+            if not opt.mse_loss:
+                output['hm'] = _sigmoid(output['hm'])
+            if opt.eval_oracle_hm:
+                output['hm'] = batch['hm']
+            for key, flag in (('wh', opt.eval_oracle_wh), ('reg', opt.eval_oracle_offset)):
+                if flag:
+                    oracle = gen_oracle_map(
+                        batch[key].detach().cpu().numpy(),
+                        batch['ind'].detach().cpu().numpy(),
+                        output[key].shape[3], output[key].shape[2])
+                    output[key] = torch.from_numpy(oracle).to(opt.device)
+            hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
+            if opt.wh_weight > 0:
+                if opt.dense_wh:
+                    dense_mask = batch['dense_wh_mask']
+                    masked_l1 = self.crit_wh(output['wh'] * dense_mask,
+                                             batch['dense_wh'] * dense_mask)
+                    wh_loss += (masked_l1 / (dense_mask.sum() + 1e-4)) / opt.num_stacks
+                elif opt.cat_spec_wh:
+                    wh_loss += self.crit_wh(
+                        output['wh'], batch['cat_spec_mask'],
+                        batch['ind'], batch['cat_spec_wh']) / opt.num_stacks
+                else:
+                    wh_loss += self.crit_reg(
+                        output['wh'], batch['reg_mask'],
+                        batch['ind'], batch['wh']) / opt.num_stacks
+            if opt.reg_offset and opt.off_weight > 0:
+                off_loss += self.crit_reg(output['reg'], batch['reg_mask'],
+                                          batch['ind'], batch['reg']) / opt.num_stacks
+            # Averaged over the stacks like the other terms, so the relative
+            # weight of the mask loss does not grow with opt.num_stacks.
+            mask_loss += self.crit_mask(
+                output['seg_feat'], output['conv_weight'], batch['reg_mask'],
+                batch['ind'], batch['instance_mask']) / opt.num_stacks
+
+        loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
+            opt.off_weight * off_loss + opt.seg_weight * mask_loss
+        loss_stats = {'loss': loss, 'hm_loss': hm_loss,
+                      'wh_loss': wh_loss, 'off_loss': off_loss, "mask_loss": mask_loss}
+        return loss, loss_stats
+
+
+class CtsegTrainer(BaseTrainer):
+    """Trainer for the 'ctseg' task: CtsegLoss plus box decoding for debug/test."""
+    def __init__(self, opt, model, optimizer=None):
+        super(CtsegTrainer, self).__init__(opt, model, optimizer=optimizer)
+
+    def _get_losses(self, opt):
+        """Return the names of the logged loss terms and the criterion."""
+        names = ['loss', 'hm_loss', 'wh_loss', 'off_loss', 'mask_loss']
+        return names, CtsegLoss(opt)
+
+    def debug(self, batch, output, iter_id):
+        """Draw predicted and ground-truth heatmaps and boxes for the first
+        image, then save them (``opt.debug == 4``) or show them."""
+        opt = self.opt
+        dets = ctdet_decode(
+            output['hm'], output['wh'],
+            reg=output['reg'] if opt.reg_offset else None,
+            cat_spec_wh=opt.cat_spec_wh, K=opt.K)
+        n_fields = dets.shape[2]
+        dets = dets.detach().cpu().numpy().reshape(1, -1, n_fields)
+        dets[:, :, :4] *= opt.down_ratio
+        dets_gt = batch['meta']['gt_det'].numpy().reshape(1, -1, n_fields)
+        dets_gt[:, :, :4] *= opt.down_ratio
+        for i in range(1):
+            debugger = Debugger(
+                dataset=opt.dataset, ipynb=(opt.debug == 3), theme=opt.debugger_theme)
+            img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+            # img * std + mean, scaled to 0..255 and clipped before the cast
+            img = np.clip((img * opt.std + opt.mean) * 255., 0, 255).astype(np.uint8)
+            pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
+            gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
+            debugger.add_blend_img(img, pred, 'pred_hm')
+            debugger.add_blend_img(img, gt, 'gt_hm')
+            # Same drawing routine for predictions and ground truth.
+            for img_id, boxes in (('out_pred', dets[i]), ('out_gt', dets_gt[i])):
+                debugger.add_img(img, img_id=img_id)
+                for box in boxes:
+                    if box[4] > opt.center_thresh:
+                        debugger.add_coco_bbox(box[:4], box[-1], box[4], img_id=img_id)
+
+            if opt.debug == 4:
+                debugger.save_all_imgs(opt.debug_dir, prefix='{}'.format(iter_id))
+            else:
+                debugger.show_all_imgs(pause=True)
+
+    def save_result(self, output, batch, results):
+        """Decode ``output`` and store the post-processed detections of the
+        first image in ``results`` under its image id."""
+        opt, meta = self.opt, batch['meta']
+        dets = ctdet_decode(
+            output['hm'], output['wh'],
+            reg=output['reg'] if opt.reg_offset else None,
+            cat_spec_wh=opt.cat_spec_wh, K=opt.K)
+        dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+        hm_shape = output['hm'].shape
+        dets_out = ctdet_post_process(
+            dets.copy(), meta['c'].cpu().numpy(), meta['s'].cpu().numpy(),
+            hm_shape[2], hm_shape[3], hm_shape[1])
+        results[meta['img_id'].cpu().numpy()[0]] = dets_out[0]
diff --git a/src/lib/trains/ddd.py b/src/lib/trains/ddd.py
new file mode 100755
index 0000000..d0bb360
--- /dev/null
+++ b/src/lib/trains/ddd.py
@@ -0,0 +1,155 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+
+from models.losses import FocalLoss, L1Loss, BinRotLoss
+from models.decode import ddd_decode
+from models.utils import _sigmoid
+from utils.debugger import Debugger
+from utils.post_process import ddd_post_process
+from utils.oracle_utils import gen_oracle_map
+from .base_trainer import BaseTrainer
+
+class DddLoss(torch.nn.Module):
+  """Loss for the 'ddd' task: heatmap, depth, dimension and rotation terms,
+  plus optional 'wh' and 'reg' regression terms."""
+  def __init__(self, opt):
+    super(DddLoss, self).__init__()
+    self.crit = torch.nn.MSELoss() if opt.mse_loss else FocalLoss()
+    self.crit_reg = L1Loss()
+    self.crit_rot = BinRotLoss()
+    self.opt = opt
+
+  def forward(self, outputs, batch):
+    """Return ``(loss, loss_stats)``; 'hm' and 'dep' outputs are rewritten in place."""
+    opt = self.opt
+    hm_loss = dep_loss = rot_loss = dim_loss = wh_loss = off_loss = 0
+    for s in range(opt.num_stacks):
+      output = outputs[s]
+      def reg_term(head, mask_key):
+        # crit_reg on one head of this stack, divided by the stack count
+        return self.crit_reg(output[head], batch[mask_key],
+                             batch['ind'], batch[head]) / opt.num_stacks
+      output['hm'] = _sigmoid(output['hm'])
+      # 'dep' becomes 1 / (sigmoid(x) + 1e-6) - 1 of the raw output
+      output['dep'] = 1. / (output['dep'].sigmoid() + 1e-6) - 1.
+      if opt.eval_oracle_dep:
+        oracle = gen_oracle_map(batch['dep'].detach().cpu().numpy(),
+                                batch['ind'].detach().cpu().numpy(),
+                                opt.output_w, opt.output_h)
+        output['dep'] = torch.from_numpy(oracle).to(opt.device)
+      hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
+      if opt.dep_weight > 0:
+        dep_loss += reg_term('dep', 'reg_mask')
+      if opt.dim_weight > 0:
+        dim_loss += reg_term('dim', 'reg_mask')
+      if opt.rot_weight > 0:
+        rot_loss += self.crit_rot(output['rot'], batch['rot_mask'], batch['ind'],
+                                  batch['rotbin'], batch['rotres']) / opt.num_stacks
+      # The 'wh' and 'reg' terms are masked with 'rot_mask' (not 'reg_mask').
+      if opt.reg_bbox and opt.wh_weight > 0:
+        wh_loss += reg_term('wh', 'rot_mask')
+      if opt.reg_offset and opt.off_weight > 0:
+        off_loss += reg_term('reg', 'rot_mask')
+
+    loss = opt.hm_weight * hm_loss + opt.dep_weight * dep_loss + \
+           opt.dim_weight * dim_loss + opt.rot_weight * rot_loss + \
+           opt.wh_weight * wh_loss + opt.off_weight * off_loss
+    loss_stats = {'loss': loss, 'hm_loss': hm_loss, 'dep_loss': dep_loss,
+                  'dim_loss': dim_loss, 'rot_loss': rot_loss,
+                  'wh_loss': wh_loss, 'off_loss': off_loss}
+    return loss, loss_stats
+
+class DddTrainer(BaseTrainer):
+  """Trainer for the 'ddd' task: DddLoss plus decoding for debug/test."""
+  def __init__(self, opt, model, optimizer=None):
+    super(DddTrainer, self).__init__(opt, model, optimizer=optimizer)
+
+  def _get_losses(self, opt):
+    """Return the names of the logged loss terms and the criterion."""
+    loss_states = ['loss', 'hm_loss', 'dep_loss', 'dim_loss', 'rot_loss',
+                   'wh_loss', 'off_loss']
+    return loss_states, DddLoss(opt)
+
+  def debug(self, batch, output, iter_id):
+    """Visualise predictions against ground truth for the first image.
+
+    Feeds the debugger heatmap blends, centre detections, 3D detections and
+    bird views, then saves (``opt.debug == 4``) or shows the images.
+    """
+    opt = self.opt
+    meta = batch['meta']
+    wh = output['wh'] if opt.reg_bbox else None
+    reg = output['reg'] if opt.reg_offset else None
+    dets = ddd_decode(output['hm'], output['rot'], output['dep'],
+                      output['dim'], wh=wh, reg=reg, K=opt.K)
+
+    # x, y, score, r1-r8, depth, dim1-dim3, cls
+    dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+    calib = meta['calib'].detach().numpy()
+    center, scale = meta['c'].detach().numpy(), meta['s'].detach().numpy()
+    # x, y, score, rot, depth, dim1, dim2, dim3
+    dets_pred = ddd_post_process(dets.copy(), center, scale, calib, opt)
+    dets_gt = ddd_post_process(
+      meta['gt_det'].detach().numpy().copy(), center, scale, calib, opt)
+    for i in range(1):
+      debugger = Debugger(dataset=opt.dataset, ipynb=(opt.debug == 3),
+                          theme=opt.debugger_theme)
+      img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+      # Clip before the uint8 cast so out-of-range values saturate instead
+      # of wrapping around.
+      img = np.clip((img * opt.std + opt.mean) * 255., 0, 255).astype(np.uint8)
+      pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
+      gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
+      debugger.add_blend_img(img, pred, 'hm_pred')
+      debugger.add_blend_img(img, gt, 'hm_gt')
+      debugger.add_ct_detection(
+        img, dets[i], show_box=opt.reg_bbox, center_thresh=opt.center_thresh,
+        img_id='det_pred')
+      debugger.add_ct_detection(
+        img, meta['gt_det'][i].cpu().numpy().copy(),
+        show_box=opt.reg_bbox, img_id='det_gt')
+      image_path = meta['image_path'][i]
+      debugger.add_3d_detection(
+        image_path, dets_pred[i], calib[i],
+        center_thresh=opt.center_thresh, img_id='add_pred')
+      debugger.add_3d_detection(
+        image_path, dets_gt[i], calib[i],
+        center_thresh=opt.center_thresh, img_id='add_gt')
+      debugger.add_bird_views(
+        dets_pred[i], dets_gt[i],
+        center_thresh=opt.center_thresh, img_id='bird_pred_gt')
+      debugger.compose_vis_add(
+        image_path, dets_pred[i], calib[i],
+        opt.center_thresh, pred, 'bird_pred_gt', img_id='out')
+      if opt.debug == 4:
+        debugger.save_all_imgs(opt.debug_dir, prefix='{}'.format(iter_id))
+      else:
+        debugger.show_all_imgs(pause=True)
+
+  def save_result(self, output, batch, results):
+    """Decode ``output`` and store the first image's post-processed
+    detections in ``results`` under its image id, filtered by threshold."""
+    opt = self.opt
+    meta = batch['meta']
+    wh = output['wh'] if opt.reg_bbox else None
+    reg = output['reg'] if opt.reg_offset else None
+    dets = ddd_decode(output['hm'], output['rot'], output['dep'],
+                      output['dim'], wh=wh, reg=reg, K=opt.K)
+
+    # x, y, score, r1-r8, depth, dim1-dim3, cls
+    dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+    calib = meta['calib'].detach().numpy()
+    # x, y, score, rot, depth, dim1, dim2, dim3
+    dets_pred = ddd_post_process(
+      dets.copy(), meta['c'].detach().numpy(),
+      meta['s'].detach().numpy(), calib, opt)
+    img_id = meta['img_id'].detach().numpy()[0]
+    results[img_id] = dets_pred[0]
+    # For keys 1..num_classes keep rows whose last column exceeds the threshold.
+    for j in range(1, opt.num_classes + 1):
+      keep_inds = (results[img_id][j][:, -1] > opt.center_thresh)
+      results[img_id][j] = results[img_id][j][keep_inds]
\ No newline at end of file
diff --git a/src/lib/trains/exdet.py b/src/lib/trains/exdet.py
new file mode 100755
index 0000000..b10d1ab
--- /dev/null
+++ b/src/lib/trains/exdet.py
@@ -0,0 +1,86 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+import cv2
+import sys
+import time
+from utils.debugger import Debugger
+from models.data_parallel import DataParallel
+from models.losses import FocalLoss, RegL1Loss
+from models.decode import agnex_ct_decode, exct_decode
+from models.utils import _sigmoid
+from .base_trainer import BaseTrainer
+
+class ExdetLoss(torch.nn.Module):
+  """Heatmap plus offset loss over the parts t, l, b, r, c, averaged over stacks."""
+  def __init__(self, opt):
+    super(ExdetLoss, self).__init__()
+    self.crit = torch.nn.MSELoss() if opt.mse_loss else FocalLoss()
+    self.crit_reg = RegL1Loss()
+    self.opt = opt
+    self.parts = ['t', 'l', 'b', 'r', 'c']
+
+  def forward(self, outputs, batch):
+    """Return (loss, loss_stats); output['hm_*'] is overwritten by _sigmoid of itself."""
+    opt = self.opt
+    hm_loss = reg_loss = 0
+    for s in range(opt.num_stacks):
+      output = outputs[s]
+      for p in self.parts:
+        tag = 'hm_' + p
+        output[tag] = _sigmoid(output[tag])
+        hm_loss += self.crit(output[tag], batch[tag]) / opt.num_stacks
+        if p != 'c' and opt.reg_offset and opt.off_weight > 0:
+          reg_loss += self.crit_reg(
+            output['reg_' + p], batch['reg_mask'], batch['ind_' + p],
+            batch['reg_' + p]) / opt.num_stacks
+    loss = opt.hm_weight * hm_loss + opt.off_weight * reg_loss
+    return loss, {'loss': loss, 'off_loss': reg_loss, 'hm_loss': hm_loss}
+
+class ExdetTrainer(BaseTrainer):
+  """Trainer for extreme-point detection; debug() renders heatmaps and boxes."""
+  parts = ['t', 'l', 'b', 'r', 'c']  # heatmap suffixes iterated by debug()
+  def __init__(self, opt, model, optimizer=None):
+    super(ExdetTrainer, self).__init__(opt, model, optimizer=optimizer)
+    self.decode = agnex_ct_decode if opt.agnostic_ex else exct_decode
+
+  def _get_losses(self, opt):
+    return ['loss', 'hm_loss', 'off_loss'], ExdetLoss(opt)
+
+  def debug(self, batch, output, iter_id):
+    """Show (or save when opt.debug == 4) heatmaps and boxes of batch entry 0."""
+    opt = self.opt
+    detections = self.decode(output['hm_t'], output['hm_l'], 
+                             output['hm_b'], output['hm_r'], 
+                             output['hm_c']).detach().cpu().numpy()
+    detections[:, :, :4] *= opt.input_res / opt.output_res
+    for i in range(1):
+      debugger = Debugger(
+        dataset=opt.dataset, ipynb=(opt.debug==3), theme=opt.debugger_theme)
+      pred_hm = np.zeros((opt.input_res, opt.input_res, 3), dtype=np.uint8)
+      gt_hm = np.zeros((opt.input_res, opt.input_res, 3), dtype=np.uint8)
+      img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+      img = np.clip((img * opt.std + opt.mean) * 255., 0, 255).astype(np.uint8)
+      for p in self.parts:
+        tag = 'hm_{}'.format(p)
+        pred = debugger.gen_colormap(output[tag][i].detach().cpu().numpy())
+        gt = debugger.gen_colormap(batch[tag][i].detach().cpu().numpy())
+        if p != 'c':
+          pred_hm, gt_hm = np.maximum(pred_hm, pred), np.maximum(gt_hm, gt)
+        if p == 'c' or opt.debug > 2:
+          debugger.add_blend_img(img, pred, 'pred_{}'.format(p))
+          debugger.add_blend_img(img, gt, 'gt_{}'.format(p))
+      debugger.add_blend_img(img, pred_hm, 'pred')
+      debugger.add_blend_img(img, gt_hm, 'gt')
+      debugger.add_img(img, img_id='out')
+      for k in range(len(detections[i])):
+        if detections[i, k, 4] > 0.1:
+          debugger.add_coco_bbox(detections[i, k, :4], detections[i, k, -1],
+                                 detections[i, k, 4], img_id='out')
+      if opt.debug == 4:
+        debugger.save_all_imgs(opt.debug_dir, prefix='{}'.format(iter_id))
+      else:
+        debugger.show_all_imgs(pause=True)
\ No newline at end of file
diff --git a/src/lib/trains/multi_pose.py b/src/lib/trains/multi_pose.py
new file mode 100755
index 0000000..42cd324
--- /dev/null
+++ b/src/lib/trains/multi_pose.py
@@ -0,0 +1,161 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import torch
+import numpy as np
+
+from models.losses import FocalLoss, RegL1Loss, RegLoss, RegWeightedL1Loss
+from models.decode import multi_pose_decode
+from models.utils import _sigmoid, flip_tensor, flip_lr_off, flip_lr
+from utils.debugger import Debugger
+from utils.post_process import multi_pose_post_process
+from utils.oracle_utils import gen_oracle_map
+from .base_trainer import BaseTrainer
+
+class MultiPoseLoss(torch.nn.Module):
+  """Weighted sum of heatmap, keypoint, size and offset losses, averaged over stacks."""
+  def __init__(self, opt):
+    super(MultiPoseLoss, self).__init__()
+    self.crit = FocalLoss()
+    self.crit_hm_hp = torch.nn.MSELoss() if opt.mse_loss else FocalLoss()
+    self.crit_kp = RegWeightedL1Loss() if not opt.dense_hp else \
+                   torch.nn.L1Loss(reduction='sum')
+    self.crit_reg = RegL1Loss() if opt.reg_loss == 'l1' else \
+                    RegLoss() if opt.reg_loss == 'sl1' else None  # None otherwise
+    self.opt = opt
+
+  def forward(self, outputs, batch):
+    """Return (loss, loss_stats); output['hm'] / ['hm_hp'] are overwritten in place."""
+    opt = self.opt
+    hm_loss = wh_loss = off_loss = hp_loss = hm_hp_loss = hp_offset_loss = 0
+    for s in range(opt.num_stacks):
+      output = outputs[s]
+      output['hm'] = _sigmoid(output['hm'])
+      if opt.hm_hp and not opt.mse_loss:
+        output['hm_hp'] = _sigmoid(output['hm_hp'])
+      # eval_oracle_* switches substitute ground truth for the prediction
+      if opt.eval_oracle_hmhp:
+        output['hm_hp'] = batch['hm_hp']
+      if opt.eval_oracle_hm:
+        output['hm'] = batch['hm']
+      if opt.eval_oracle_kps:
+        if opt.dense_hp:
+          output['hps'] = batch['dense_hps']
+        else:
+          output['hps'] = torch.from_numpy(gen_oracle_map(
+            batch['hps'].detach().cpu().numpy(), 
+            batch['ind'].detach().cpu().numpy(), 
+            opt.output_res, opt.output_res)).to(opt.device)
+      if opt.eval_oracle_hp_offset:
+        output['hp_offset'] = torch.from_numpy(gen_oracle_map(
+          batch['hp_offset'].detach().cpu().numpy(), 
+          batch['hp_ind'].detach().cpu().numpy(), 
+          opt.output_res, opt.output_res)).to(opt.device)
+
+      hm_loss += self.crit(output['hm'], batch['hm']) / opt.num_stacks
+      if opt.dense_hp:
+        mask_weight = batch['dense_hps_mask'].sum() + 1e-4
+        hp_loss += (self.crit_kp(output['hps'] * batch['dense_hps_mask'], 
+                                 batch['dense_hps'] * batch['dense_hps_mask']) / 
+                                 mask_weight) / opt.num_stacks
+      else:
+        hp_loss += self.crit_kp(output['hps'], batch['hps_mask'], 
+                                batch['ind'], batch['hps']) / opt.num_stacks
+      if opt.wh_weight > 0:
+        wh_loss += self.crit_reg(output['wh'], batch['reg_mask'],
+                                 batch['ind'], batch['wh']) / opt.num_stacks
+      if opt.reg_offset and opt.off_weight > 0:
+        off_loss += self.crit_reg(output['reg'], batch['reg_mask'],
+                                  batch['ind'], batch['reg']) / opt.num_stacks
+      if opt.reg_hp_offset and opt.off_weight > 0:
+        hp_offset_loss += self.crit_reg(
+          output['hp_offset'], batch['hp_mask'],
+          batch['hp_ind'], batch['hp_offset']) / opt.num_stacks
+      if opt.hm_hp and opt.hm_hp_weight > 0:
+        hm_hp_loss += self.crit_hm_hp(
+          output['hm_hp'], batch['hm_hp']) / opt.num_stacks
+    loss = opt.hm_weight * hm_loss + opt.wh_weight * wh_loss + \
+           opt.off_weight * off_loss + opt.hp_weight * hp_loss + \
+           opt.hm_hp_weight * hm_hp_loss + opt.off_weight * hp_offset_loss
+    
+    loss_stats = {'loss': loss, 'hm_loss': hm_loss, 'hp_loss': hp_loss, 
+                  'hm_hp_loss': hm_hp_loss, 'hp_offset_loss': hp_offset_loss,
+                  'wh_loss': wh_loss, 'off_loss': off_loss}
+    return loss, loss_stats
+
+class MultiPoseTrainer(BaseTrainer):  # wires MultiPoseLoss, debug rendering and result export
+  def __init__(self, opt, model, optimizer=None):
+    super(MultiPoseTrainer, self).__init__(opt, model, optimizer=optimizer)
+  
+  def _get_losses(self, opt):  # names below equal the loss_stats keys of MultiPoseLoss
+    loss_states = ['loss', 'hm_loss', 'hp_loss', 'hm_hp_loss', 
+                   'hp_offset_loss', 'wh_loss', 'off_loss']
+    loss = MultiPoseLoss(opt)
+    return loss_states, loss
+
+  def debug(self, batch, output, iter_id):  # visualise batch entry 0 only
+    opt = self.opt
+    reg = output['reg'] if opt.reg_offset else None
+    hm_hp = output['hm_hp'] if opt.hm_hp else None
+    hp_offset = output['hp_offset'] if opt.reg_hp_offset else None
+    dets = multi_pose_decode(
+      output['hm'], output['wh'], output['hps'], 
+      reg=reg, hm_hp=hm_hp, hp_offset=hp_offset, K=opt.K)
+    dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+    # columns 0:4 (box) and 5:39 (34 keypoint values) go from output to input resolution
+    dets[:, :, :4] *= opt.input_res / opt.output_res
+    dets[:, :, 5:39] *= opt.input_res / opt.output_res
+    dets_gt = batch['meta']['gt_det'].numpy().reshape(1, -1, dets.shape[2])
+    dets_gt[:, :, :4] *= opt.input_res / opt.output_res
+    dets_gt[:, :, 5:39] *= opt.input_res / opt.output_res
+    for i in range(1):
+      debugger = Debugger(
+        dataset=opt.dataset, ipynb=(opt.debug==3), theme=opt.debugger_theme)
+      img = batch['input'][i].detach().cpu().numpy().transpose(1, 2, 0)
+      img = np.clip(((
+        img * opt.std + opt.mean) * 255.), 0, 255).astype(np.uint8)  # undo normalisation
+      pred = debugger.gen_colormap(output['hm'][i].detach().cpu().numpy())
+      gt = debugger.gen_colormap(batch['hm'][i].detach().cpu().numpy())
+      debugger.add_blend_img(img, pred, 'pred_hm')
+      debugger.add_blend_img(img, gt, 'gt_hm')
+      # predictions scoring above center_thresh (column 4), drawn on 'out_pred'
+      debugger.add_img(img, img_id='out_pred')
+      for k in range(len(dets[i])):
+        if dets[i, k, 4] > opt.center_thresh:
+          debugger.add_coco_bbox(dets[i, k, :4], dets[i, k, -1],
+                                 dets[i, k, 4], img_id='out_pred')
+          debugger.add_coco_hp(dets[i, k, 5:39], img_id='out_pred')
+      # the same rendering for the ground-truth rows, drawn on 'out_gt'
+      debugger.add_img(img, img_id='out_gt')
+      for k in range(len(dets_gt[i])):
+        if dets_gt[i, k, 4] > opt.center_thresh:
+          debugger.add_coco_bbox(dets_gt[i, k, :4], dets_gt[i, k, -1],
+                                 dets_gt[i, k, 4], img_id='out_gt')
+          debugger.add_coco_hp(dets_gt[i, k, 5:39], img_id='out_gt')
+      # keypoint heatmaps are only blended when opt.hm_hp is set
+      if opt.hm_hp:
+        pred = debugger.gen_colormap_hp(output['hm_hp'][i].detach().cpu().numpy())
+        gt = debugger.gen_colormap_hp(batch['hm_hp'][i].detach().cpu().numpy())
+        debugger.add_blend_img(img, pred, 'pred_hmhp')
+        debugger.add_blend_img(img, gt, 'gt_hmhp')
+      # opt.debug == 4 writes files into opt.debug_dir; any other value opens windows
+      if opt.debug == 4:
+        debugger.save_all_imgs(opt.debug_dir, prefix='{}'.format(iter_id))
+      else:
+        debugger.show_all_imgs(pause=True)
+
+  def save_result(self, output, batch, results):  # stores entry 0 under its img_id
+    reg = output['reg'] if self.opt.reg_offset else None
+    hm_hp = output['hm_hp'] if self.opt.hm_hp else None
+    hp_offset = output['hp_offset'] if self.opt.reg_hp_offset else None
+    dets = multi_pose_decode(
+      output['hm'], output['wh'], output['hps'], 
+      reg=reg, hm_hp=hm_hp, hp_offset=hp_offset, K=self.opt.K)
+    dets = dets.detach().cpu().numpy().reshape(1, -1, dets.shape[2])
+    # NOTE(review): presumably maps detections back to image space via c / s - confirm
+    dets_out = multi_pose_post_process(
+      dets.copy(), batch['meta']['c'].cpu().numpy(),
+      batch['meta']['s'].cpu().numpy(),
+      output['hm'].shape[2], output['hm'].shape[3])
+    results[batch['meta']['img_id'].cpu().numpy()[0]] = dets_out[0]
\ No newline at end of file
diff --git a/src/lib/trains/train_factory.py b/src/lib/trains/train_factory.py
new file mode 100644
index 0000000..21c6a4c
--- /dev/null
+++ b/src/lib/trains/train_factory.py
@@ -0,0 +1,17 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from .ctdet import CtdetTrainer
+from .ddd import DddTrainer
+from .exdet import ExdetTrainer
+from .multi_pose import MultiPoseTrainer
+from .ctseg import CtsegTrainer
+
+train_factory = {  # name -> trainer class (presumably keyed by the task option - confirm)
+    'exdet': ExdetTrainer,
+    'ddd': DddTrainer,
+    'ctdet': CtdetTrainer,
+    'multi_pose': MultiPoseTrainer,
+    'ctseg': CtsegTrainer
+}
diff --git a/src/lib/utils/__init__.py b/src/lib/utils/__init__.py
new file mode 100755
index 0000000..e69de29
diff --git a/src/lib/utils/ddd_utils.py b/src/lib/utils/ddd_utils.py
new file mode 100644
index 0000000..4f0c594
--- /dev/null
+++ b/src/lib/utils/ddd_utils.py
@@ -0,0 +1,131 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import cv2
+
+def compute_box_3d(dim, location, rotation_y):
+  """Return the 8 x 3 corners of a box rotated about Y and moved to location.
+
+  dim is read as (h, w, l) = dim[0..2]; before rotation and translation
+  corners 0-3 lie at y = 0 and corners 4-7 at y = -h.
+  """
+  cos_r, sin_r = np.cos(rotation_y), np.sin(rotation_y)
+  rot = np.array(
+    [[cos_r, 0, sin_r], [0, 1, 0], [-sin_r, 0, cos_r]], dtype=np.float32)
+  h, w, l = dim[0], dim[1], dim[2]
+  hl, hw = l / 2, w / 2
+  local = np.array([[hl, hl, -hl, -hl, hl, hl, -hl, -hl],
+                    [0, 0, 0, 0, -h, -h, -h, -h],
+                    [hw, -hw, -hw, hw, hw, -hw, -hw, hw]], dtype=np.float32)
+  offset = np.array(location, dtype=np.float32).reshape(3, 1)
+  return (np.dot(rot, local) + offset).transpose(1, 0)
+
+def project_to_image(pts_3d, P):
+  """Project n x 3 points with the 3 x 4 matrix P and return n x 2 points.
+
+  Each point gets a homogeneous 1 appended, is multiplied by P, and its
+  first two coordinates are divided by the third.
+  """
+  ones = np.ones((pts_3d.shape[0], 1), dtype=np.float32)
+  homo = np.concatenate([pts_3d, ones], axis=1)
+  projected = np.dot(P, homo.transpose(1, 0)).transpose(1, 0)
+  return projected[:, :2] / projected[:, 2:]
+
+def compute_orientation_3d(dim, location, rotation_y):
+  """Return the 2 x 3 end points of the heading segment of a box.
+
+  Row 0 is location itself; row 1 is (dim[2], 0, 0) rotated about Y by
+  rotation_y and then shifted by location.
+  """
+  cos_r, sin_r = np.cos(rotation_y), np.sin(rotation_y)
+  rot = np.array(
+    [[cos_r, 0, sin_r], [0, 1, 0], [-sin_r, 0, cos_r]], dtype=np.float32)
+  segment = np.array([[0, dim[2]], [0, 0], [0, 0]], dtype=np.float32)
+  origin = np.array(location, dtype=np.float32).reshape(3, 1)
+  return (np.dot(rot, segment) + origin).transpose(1, 0)
+
+def draw_box_3d(image, corners, c=(0, 0, 255)):
+  """Draw the four side faces of a projected box (8 x 2 corners) on image."""
+  # cv2.line wants integer pixel coordinates; projected corners are floats
+  pts = [tuple(p) for p in np.asarray(corners)[:, :2].astype(np.int32).tolist()]
+  face_idx = [[0, 1, 5, 4], [1, 2, 6, 5],
+              [2, 3, 7, 6], [3, 0, 4, 7]]
+  for ind_f in range(3, -1, -1):
+    f = face_idx[ind_f]
+    for j in range(4):
+      cv2.line(image, pts[f[j]], pts[f[(j + 1) % 4]], c, 2,
+               lineType=cv2.LINE_AA)
+    if ind_f == 0:
+      # face 0 additionally gets its two diagonals, drawn one pixel wide
+      cv2.line(image, pts[f[0]], pts[f[2]], c, 1, lineType=cv2.LINE_AA)
+      cv2.line(image, pts[f[1]], pts[f[3]], c, 1, lineType=cv2.LINE_AA)
+  return image
+
+def unproject_2d_to_3d(pt_2d, depth, P):
+  """Lift the pixel pt_2d at the given depth to a float32 3-vector using P.
+
+  P is indexed as a 3 x 4 matrix: z = depth - P[2, 3], and x / y invert
+  rows 0 and 1 of the projection for that z.
+  """
+  z = depth - P[2, 3]
+  x = (pt_2d[0] * depth - P[0, 3] - P[0, 2] * z) / P[0, 0]
+  y = (pt_2d[1] * depth - P[1, 3] - P[1, 2] * z) / P[1, 1]
+  return np.array([x, y, z], dtype=np.float32)
+
+def alpha2rot_y(alpha, x, cx, fx):
+    """
+    Get rotation_y = alpha + arctan2(x - cx, fx), wrapped once by 2*pi
+    alpha : Observation angle of object, ranging [-pi..pi]
+    x : Object center x in pixels; cx, fx : calibration terms it is measured against
+    rotation_y : Rotation ry around Y-axis in camera coordinates [-pi..pi]
+    """
+    rot_y = alpha + np.arctan2(x - cx, fx)
+    if rot_y > np.pi:
+      rot_y -= 2 * np.pi
+    if rot_y < -np.pi:
+      rot_y += 2 * np.pi
+    return rot_y
+
+def rot_y2alpha(rot_y, x, cx, fx):
+    """
+    Get alpha = rot_y - arctan2(x - cx, fx), wrapped once by 2*pi
+    rot_y : Rotation ry around Y-axis in camera coordinates [-pi..pi]
+    x : Object center x in pixels; cx, fx : calibration terms it is measured against
+    alpha : Observation angle of object, ranging [-pi..pi]
+    """
+    alpha = rot_y - np.arctan2(x - cx, fx)
+    if alpha > np.pi:
+      alpha -= 2 * np.pi
+    if alpha < -np.pi:
+      alpha += 2 * np.pi
+    return alpha
+
+
+def ddd2locrot(center, alpha, dim, depth, calib):
+  # single image: lift center / depth to a 3D location and alpha to rotation_y
+  locations = unproject_2d_to_3d(center, depth, calib)
+  locations[1] += dim[0] / 2  # shift y by half of dim[0] (read as h in compute_box_3d)
+  rotation_y = alpha2rot_y(alpha, center[0], calib[0, 2], calib[0, 0])
+  return locations, rotation_y
+
+def project_3d_bbox(location, dim, rotation_y, calib):
+  """Return the compute_box_3d corners projected to the image with calib."""
+  corners_3d = compute_box_3d(dim, location, rotation_y)
+  return project_to_image(corners_3d, calib)
+
+
+if __name__ == '__main__':  # manual check: print alpha2rot_y next to a reference rotation_y
+  calib = np.array(
+    [[7.070493000000e+02, 0.000000000000e+00, 6.040814000000e+02, 4.575831000000e+01],
+     [0.000000000000e+00, 7.070493000000e+02, 1.805066000000e+02, -3.454157000000e-01],
+     [0.000000000000e+00, 0.000000000000e+00, 1.000000000000e+00, 4.981016000000e-03]],
+    dtype=np.float32)
+  alpha = -0.20
+  tl = np.array([712.40, 143.00], dtype=np.float32)  # box top-left in pixels
+  br = np.array([810.73, 307.92], dtype=np.float32)  # box bottom-right in pixels
+  ct = (tl + br) / 2  # box center; only its x coordinate is used below
+  rotation_y = 0.01  # value printed for comparison with the conversion
+  print('alpha2rot_y', alpha2rot_y(alpha, ct[0], calib[0, 2], calib[0, 0]))
+  print('rotation_y', rotation_y)
\ No newline at end of file
diff --git a/src/lib/utils/debugger.py b/src/lib/utils/debugger.py
new file mode 100644
index 0000000..6656e1a
--- /dev/null
+++ b/src/lib/utils/debugger.py
@@ -0,0 +1,561 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import cv2
+from .ddd_utils import compute_box_3d, project_to_image, draw_box_3d
+
+
+class Debugger(object):
+    def __init__(self, ipynb=False, theme='black',
+                 num_classes=-1, dataset=None, down_ratio=4):  # set up colours, class names and per-dataset constants
+        self.ipynb = ipynb
+        if not self.ipynb:  # NOTE(review): pyplot is imported only when ipynb is False, yet show_all_imgs reads self.plt only when ipynb is True
+            import matplotlib.pyplot as plt
+            self.plt = plt
+        self.imgs = {}  # img_id -> image array, filled by the add_* methods
+        self.theme = theme
+        colors = [(color_list[_]).astype(np.uint8)
+                  for _ in range(len(color_list))]  # color_list is defined outside this method
+        self.colors = np.array(colors, dtype=np.uint8).reshape(
+            len(colors), 1, 1, 3)
+        if self.theme == 'white':  # reverse the flattened colour bytes, then cap values at 0.6 * 255
+            self.colors = self.colors.reshape(-1)[::-
+                                                  1].reshape(len(colors), 1, 1, 3)
+            self.colors = np.clip(self.colors, 0., 0.6 * 255).astype(np.uint8)
+        self.dim_scale = 1
+        if dataset == 'coco_hp':  # pose setup: 17 joints, skeleton edges and their colours
+            self.names = ['p']
+            self.num_class = 1
+            self.num_joints = 17
+            self.edges = [[0, 1], [0, 2], [1, 3], [2, 4],
+                          [3, 5], [4, 6], [5, 6],
+                          [5, 7], [7, 9], [6, 8], [8, 10],
+                          [5, 11], [6, 12], [11, 12],
+                          [11, 13], [13, 15], [12, 14], [14, 16]]
+            self.ec = [(255, 0, 0), (0, 0, 255), (255, 0, 0), (0, 0, 255),
+                       (255, 0, 0), (0, 0, 255), (255, 0, 255),
+                       (255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255),
+                       (255, 0, 0), (0, 0, 255), (255, 0, 255),
+                       (255, 0, 0), (255, 0, 0), (0, 0, 255), (0, 0, 255)]  # one colour per entry of self.edges
+            self.colors_hp = [(255, 0, 255),
+                              (255, 0, 0), (0, 0, 255),
+                              (255, 0, 0), (0, 0, 255),
+                              (255, 0, 0), (0, 0, 255),
+                              (255, 0, 0), (0, 0, 255),
+                              (255, 0, 0), (0, 0, 255),
+                              (255, 0, 0), (0, 0, 255),
+                              (255, 0, 0), (0, 0, 255),
+                              (255, 0, 0), (0, 0, 255)]  # one colour per joint
+        elif num_classes == 80 or dataset == 'coco':
+            self.names = coco_class_name
+        elif num_classes == 20 or dataset == 'pascal':
+            self.names = pascal_class_name
+        elif dataset == 'gta':
+            self.names = gta_class_name
+            self.focal_length = 935.3074360871937
+            self.W = 1920
+            self.H = 1080
+            self.dim_scale = 3
+        elif dataset == 'viper':
+            self.names = gta_class_name
+            self.focal_length = 1158
+            self.W = 1920
+            self.H = 1080
+            self.dim_scale = 3
+        elif num_classes == 3 or dataset == 'kitti':
+            self.names = kitti_class_name
+            self.focal_length = 721.5377
+            self.W = 1242
+            self.H = 375
+        num_classes = len(self.names)  # NOTE(review): local is never read; raises AttributeError when no branch above set self.names
+        self.down_ratio = down_ratio  # factor between heatmap and image coordinates used by the drawing helpers
+        # for bird view: world_size is divided out and out_size multiplied in by project_3d_to_bird
+        self.world_size = 64
+        self.out_size = 384
+
+    def add_img(self, img, img_id='default', revert_color=False):
+        """Store a copy of img under img_id, as 255 - img when revert_color."""
+        stored = 255 - img if revert_color else img
+        self.imgs[img_id] = stored.copy()
+
+    def add_mask(self, mask, bg, imgId='default', trans=0.8):
+        """Store mask * 255 * trans + bg * (1 - trans) as a uint8 image."""
+        layer = mask.reshape(mask.shape[0], mask.shape[1], 1) * 255 * trans
+        self.imgs[imgId] = (layer + bg * (1 - trans)).astype(np.uint8)
+
+    def show_img(self, pause=False, imgId='default'):  # show one stored image in an OpenCV window
+        cv2.imshow('{}'.format(imgId), self.imgs[imgId])  # window title is the image id
+        if pause:
+            cv2.waitKey()  # block until a key is pressed
+
+    def add_blend_img(self, back, fore, img_id='blend', trans=0.7):
+        """Store back * (1 - trans) + fore * trans, clipped to uint8, as img_id."""
+        if self.theme == 'white':
+            fore = 255 - fore
+        # resize when height or width differs (the width test compared shape[0])
+        if fore.shape[0] != back.shape[0] or fore.shape[1] != back.shape[1]:
+            fore = cv2.resize(fore, (back.shape[1], back.shape[0]))
+        if len(fore.shape) == 2:
+            fore = fore.reshape(fore.shape[0], fore.shape[1], 1)
+        blend = np.clip(back * (1. - trans) + fore * trans, 0, 255)
+        self.imgs[img_id] = blend.astype(np.uint8)
+
+    '''
+  # slow version
+  def gen_colormap(self, img, output_res=None):
+    # num_classes = len(self.colors)
+    img[img < 0] = 0
+    h, w = img.shape[1], img.shape[2]
+    if output_res is None:
+      output_res = (h * self.down_ratio, w * self.down_ratio)
+    color_map = np.zeros((output_res[0], output_res[1], 3), dtype=np.uint8)
+    for i in range(img.shape[0]):
+      resized = cv2.resize(img[i], (output_res[1], output_res[0]))
+      resized = resized.reshape(output_res[0], output_res[1], 1)
+      cl = self.colors[i] if not (self.theme == 'white') \
+           else 255 - self.colors[i]
+      color_map = np.maximum(color_map, (resized * cl).astype(np.uint8))
+    return color_map
+    '''
+
+    def gen_colormap(self, img, output_res=None):
+        """Colour (c, h, w) heatmaps with self.colors; output_res is (height, width)."""
+        c, h, w = img.shape[0], img.shape[1], img.shape[2]
+        if output_res is None:
+            output_res = (h * self.down_ratio, w * self.down_ratio)
+        img = img.transpose(1, 2, 0).reshape(h, w, c, 1).astype(np.float32)
+        colors = np.array(
+            self.colors, dtype=np.float32).reshape(-1, 3)[:c].reshape(1, 1, c, 3)
+        if self.theme == 'white':
+            colors = 255 - colors
+        color_map = (img * colors).max(axis=2).astype(np.uint8)
+        # cv2.resize takes dsize as (width, height), the reverse of output_res
+        return cv2.resize(color_map, (output_res[1], output_res[0]))
+
+    '''
+  # slow
+  def gen_colormap_hp(self, img, output_res=None):
+    # num_classes = len(self.colors)
+    # img[img < 0] = 0
+    h, w = img.shape[1], img.shape[2]
+    if output_res is None:
+      output_res = (h * self.down_ratio, w * self.down_ratio)
+    color_map = np.zeros((output_res[0], output_res[1], 3), dtype=np.uint8)
+    for i in range(img.shape[0]):
+      resized = cv2.resize(img[i], (output_res[1], output_res[0]))
+      resized = resized.reshape(output_res[0], output_res[1], 1)
+      cl =  self.colors_hp[i] if not (self.theme == 'white') else \
+        (255 - np.array(self.colors_hp[i]))
+      color_map = np.maximum(color_map, (resized * cl).astype(np.uint8))
+    return color_map
+  '''
+
+    def gen_colormap_hp(self, img, output_res=None):
+        """Colour (c, h, w) heatmaps with colors_hp; output_res is (height, width)."""
+        c, h, w = img.shape[0], img.shape[1], img.shape[2]
+        if output_res is None:
+            output_res = (h * self.down_ratio, w * self.down_ratio)
+        img = img.transpose(1, 2, 0).reshape(h, w, c, 1).astype(np.float32)
+        colors = np.array(
+            self.colors_hp, dtype=np.float32).reshape(-1, 3)[:c].reshape(1, 1, c, 3)
+        if self.theme == 'white':
+            colors = 255 - colors
+        color_map = (img * colors).max(axis=2).astype(np.uint8)
+        return cv2.resize(color_map, (output_res[1], output_res[0]))  # dsize is (width, height)
+
+    def add_rect(self, rect1, rect2, c, conf=1, img_id='default'):
+        """Draw the rectangle spanned by corners rect1 and rect2 on img_id.
+
+        When conf < 1, each of the four corners also gets a circle of
+        radius int(10 * conf)."""
+        canvas = self.imgs[img_id]
+        cv2.rectangle(canvas, (rect1[0], rect1[1]), (rect2[0], rect2[1]), c, 2)
+        if conf < 1:
+            radius = int(10 * conf)
+            for x, y in ((rect1[0], rect1[1]), (rect2[0], rect2[1]),
+                         (rect1[0], rect2[1]), (rect2[0], rect1[1])):
+                cv2.circle(canvas, (x, y), radius, c, 1)
+
+    def add_coco_bbox(self, bbox, cat, conf=1, show_txt=True, img_id='default'):
+        """Draw bbox (x1, y1, x2, y2, cast to int32) in the colour of class cat;
+        with show_txt a filled label '<name><conf:.1f>' sits above its top-left."""
+        bbox = np.array(bbox, dtype=np.int32)
+        cat = int(cat)
+        color = self.colors[cat][0][0].tolist()
+        if self.theme == 'white':
+            color = (255 - np.array(color)).tolist()
+        label = '{}{:.1f}'.format(self.names[cat], conf)
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        label_size = cv2.getTextSize(label, font, 0.5, 2)[0]
+        cv2.rectangle(
+            self.imgs[img_id], (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
+        if show_txt:
+            cv2.rectangle(self.imgs[img_id],
+                          (bbox[0], bbox[1] - label_size[1] - 2),
+                          (bbox[0] + label_size[0], bbox[1] - 2), color, -1)
+            cv2.putText(self.imgs[img_id], label, (bbox[0], bbox[1] - 2),
+                        font, 0.5, (0, 0, 0), thickness=1, lineType=cv2.LINE_AA)
+
+    def add_coco_hp(self, points, img_id='default'):  # draw joints and skeleton edges on img_id
+        points = np.array(points, dtype=np.int32).reshape(self.num_joints, 2)  # one (x, y) row per joint
+        for j in range(self.num_joints):
+            cv2.circle(self.imgs[img_id],
+                       (points[j, 0], points[j, 1]), 3, self.colors_hp[j], -1)
+        for j, e in enumerate(self.edges):
+            if points[e].min() > 0:  # skip edges where any end-point coordinate is <= 0
+                cv2.line(self.imgs[img_id], (points[e[0], 0], points[e[0], 1]),
+                         (points[e[1], 0], points[e[1], 1]), self.ec[j], 2,
+                         lineType=cv2.LINE_AA)
+
+    def add_points(self, points, img_id='default'):
+        """Draw every points[i][j] on img_id as a filled dot.
+
+        Coordinates are multiplied by down_ratio; each dot is a white disc
+        of radius 5 under a radius-3 disc in self.colors[i]."""
+        for i in range(len(points)):
+            for pt in points[i]:
+                c = self.colors[i, 0, 0]
+                center = (pt[0] * self.down_ratio, pt[1] * self.down_ratio)
+                cv2.circle(self.imgs[img_id], center, 5, (255, 255, 255), -1)
+                cv2.circle(self.imgs[img_id], center, 3,
+                           (int(c[0]), int(c[1]), int(c[2])), -1)
+
+    def show_all_imgs(self, pause=False, time=0):
+        """Show all stored images: OpenCV windows, or one pyplot row if ipynb."""
+        if not self.ipynb:
+            for i, v in self.imgs.items():
+                cv2.imshow('{}'.format(i), v)
+            if cv2.waitKey(0 if pause else 1) == 27:  # Esc ends the process
+                import sys
+                sys.exit(0)
+            return
+        self.ax = None
+        plt = getattr(self, 'plt', None)
+        if plt is None:  # __init__ skips the pyplot import when ipynb is True
+            import matplotlib.pyplot as plt
+        nImgs = len(self.imgs)
+        fig = plt.figure(figsize=(max(nImgs, 1) * 10, 10))  # no zero-width figure
+        for i, v in enumerate(self.imgs.values()):
+            fig.add_subplot(1, nImgs, i + 1)
+            rgb = cv2.cvtColor(v, cv2.COLOR_BGR2RGB) if len(v.shape) == 3 else v
+            plt.imshow(rgb)
+        plt.show()
+
+    def save_img(self, imgId='default', path='./cache/debug/'):  # write one stored image as <path><imgId>.png
+        cv2.imwrite(path + '{}.png'.format(imgId), self.imgs[imgId])  # path is concatenated as-is, no separator added
+
+    def save_all_imgs(self, path='./cache/debug/', prefix='', genID=False):
+        """Write every stored image to path/<prefix><img_id>.png."""
+        if genID:
+            # the running counter in path/id.txt replaces the given prefix
+            try:
+                prefix = idx = int(np.loadtxt(path + '/id.txt'))
+            except Exception:  # missing or unreadable counter: restart at 0
+                prefix = idx = 0
+            np.savetxt(path + '/id.txt', np.ones(1) * (idx + 1), fmt='%d')
+        for i, v in self.imgs.items():
+            cv2.imwrite(path + '/{}{}.png'.format(prefix, i), v)
+
+    def remove_side(self, img_id, img):
+        """Crop self.imgs[img_id] to the rows and columns where img is non-zero.
+        Does nothing when img_id is not stored."""
+        if not (img_id in self.imgs):
+            return
+        ws = img.sum(axis=2).sum(axis=0)
+        hs = img.sum(axis=2).sum(axis=1)
+        l, r, t, b = 0, ws.shape[0] - 1, 0, hs.shape[0] - 1
+        # bounds are tested before indexing so an all-zero img cannot overrun
+        while l < len(ws) and ws[l] == 0:
+            l += 1
+        while r > 0 and ws[r] == 0:
+            r -= 1
+        while t < len(hs) and hs[t] == 0:
+            t += 1
+        while b > 0 and hs[b] == 0:
+            b -= 1
+        self.imgs[img_id] = self.imgs[img_id][t:b+1, l:r+1].copy()
+
+    def project_3d_to_bird(self, pt):  # map a 2-component world point to bird-view pixel coordinates
+        pt[0] += self.world_size / 2  # NOTE: modifies the caller's pt in place
+        pt[1] = self.world_size - pt[1]  # flip the second axis within world_size
+        pt = pt * self.out_size / self.world_size  # rescale world units to out_size pixels (new array)
+        return pt.astype(np.int32)
+
+    def add_ct_detection(
+            self, img, dets, show_box=False, show_txt=True,
+            center_thresh=0.5, img_id='det'):
+        """Copy img into self.imgs[img_id] and draw detections above center_thresh.
+
+        dets is a dict {cat: rows of x, y, score, ..., w, h} (boxes only, when
+        show_box) or an array of rows x, y, score, ..., w, h, cat whose
+        coordinates are multiplied by down_ratio before drawing."""
+        self.imgs[img_id] = img.copy()
+        if isinstance(dets, dict):
+            for cat in dets:
+                for i in range(len(dets[cat])):
+                    if show_box and dets[cat][i, 2] > center_thresh:
+                        w, h = dets[cat][i, -2], dets[cat][i, -1]
+                        x, y = dets[cat][i, 0], dets[cat][i, 1]
+                        bbox = np.array([x - w / 2, y - h / 2, x + w / 2, y + h / 2],
+                                        dtype=np.float32)
+                        self.add_coco_bbox(
+                            bbox, cat - 1, dets[cat][i, 2],
+                            show_txt=show_txt, img_id=img_id)
+        else:
+            for i in range(len(dets)):
+                if dets[i, 2] > center_thresh:
+                    cat = int(dets[i, -1])
+                    cl = (self.colors[cat, 0, 0] if self.theme == 'black' else
+                          255 - self.colors[cat, 0, 0]).tolist()
+                    ct = dets[i, :2].astype(np.int32) * self.down_ratio
+                    cv2.circle(self.imgs[img_id], (ct[0], ct[1]), 3, cl, -1)
+                    if show_box:
+                        w, h = dets[i, -3] * \
+                            self.down_ratio, dets[i, -2] * self.down_ratio
+                        x, y = dets[i, 0] * \
+                            self.down_ratio, dets[i, 1] * self.down_ratio
+                        bbox = np.array([x - w / 2, y - h / 2, x + w / 2, y + h / 2],
+                                        dtype=np.float32)
+                        self.add_coco_bbox(
+                            bbox, dets[i, -1], dets[i, 2], img_id=img_id)
+
+    def add_3d_detection(
+            self, image_or_path, dets, calib, show_txt=False,
+            center_thresh=0.5, img_id='det'):  # draw projected 3D boxes of dets on the image
+        if isinstance(image_or_path, np.ndarray):
+            self.imgs[img_id] = image_or_path  # stored without copying: drawing touches the caller's array
+        else:
+            self.imgs[img_id] = cv2.imread(image_or_path)
+        for cat in dets:
+            for i in range(len(dets[cat])):
+                cl = (self.colors[cat - 1, 0, 0]).tolist()  # dets keys are offset by one from colour rows
+                if dets[cat][i, -1] > center_thresh:  # last column is compared with the threshold
+                    dim = dets[cat][i, 5:8]
+                    loc = dets[cat][i, 8:11]
+                    rot_y = dets[cat][i, 11]
+                    # row layout used here: 5:8 dimensions, 8:11 location, 11 rotation_y;
+                    # self.dim_scale is not applied to dim or loc in this method
+                    if loc[2] > 1:  # only boxes whose third location component exceeds 1
+                        box_3d = compute_box_3d(dim, loc, rot_y)
+                        box_2d = project_to_image(box_3d, calib)
+                        self.imgs[img_id] = draw_box_3d(
+                            self.imgs[img_id], box_2d, cl)
+
+    def compose_vis_add(
+            self, img_path, dets, calib,
+            center_thresh, pred, bev, img_id='out'):
+        """Blend pred over the image at img_path, draw 3D boxes, append imgs[bev]."""
+        self.imgs[img_id] = cv2.imread(img_path)
+        # hs / ws: row / column ratio between the loaded image and pred's size.
+        h, w = pred.shape[:2]
+        hs, ws = self.imgs[img_id].shape[0] / h, self.imgs[img_id].shape[1] / w
+        self.imgs[img_id] = cv2.resize(self.imgs[img_id], (w, h))
+        self.add_blend_img(self.imgs[img_id], pred, img_id)
+        for cat in dets:
+            for i in range(len(dets[cat])):
+                cl = (self.colors[cat - 1, 0, 0]).tolist()
+                if dets[cat][i, -1] > center_thresh:
+                    dim = dets[cat][i, 5:8]
+                    loc = dets[cat][i, 8:11]
+                    rot_y = dets[cat][i, 11]
+                    if loc[2] > 1:
+                        box_3d = compute_box_3d(dim, loc, rot_y)
+                        box_2d = project_to_image(box_3d, calib)
+                        # Column 0 is x (assumed) -> width ratio; column 1 -> height
+                        # ratio. The two divisors used to be swapped.
+                        box_2d[:, 0] /= ws
+                        box_2d[:, 1] /= hs
+                        self.imgs[img_id] = draw_box_3d(
+                            self.imgs[img_id], box_2d, cl)
+        self.imgs[img_id] = np.concatenate(
+            [self.imgs[img_id], self.imgs[bev]], axis=1)
+
+    def add_2d_detection(
+            self, img, dets, show_box=False, show_txt=True,
+            center_thresh=0.5, img_id='det'):
+        """Store img under img_id and box every row scoring above center_thresh."""
+        self.imgs[img_id] = img
+        for cat in dets:
+            rows = dets[cat]
+            for k in range(len(rows)):
+                self.colors[cat - 1, 0, 0].tolist()  # colour lookup kept; result unused
+                if rows[k, -1] > center_thresh:
+                    self.add_coco_bbox(rows[k, 1:5], cat - 1, rows[k, -1],
+                                       show_txt=show_txt, img_id=img_id)
+
+    def add_bird_view(self, dets, center_thresh=0.3, img_id='bird'):
+        """Outline each row scoring above center_thresh on a grey canvas (imgs[img_id])."""
+        bird_view = np.ones(
+            (self.out_size, self.out_size, 3), dtype=np.uint8) * 230
+        lc = (250, 152, 12)  # one line colour for every category
+        for cat in dets:
+            for i in range(len(dets[cat])):
+                if dets[cat][i, -1] > center_thresh:
+                    dim = dets[cat][i, 5:8]
+                    loc = dets[cat][i, 8:11]
+                    rot_y = dets[cat][i, 11]
+                    # First four box corners, keeping coordinates 0 and 2 only.
+                    rect = compute_box_3d(dim, loc, rot_y)[:4, [0, 2]]
+                    for k in range(4):
+                        rect[k] = self.project_3d_to_bird(rect[k])
+                    # OpenCV drawing calls need integer pixel coordinates.
+                    pts = rect.astype(np.int32)
+                    cv2.polylines(
+                        bird_view, [pts.reshape(-1, 1, 2)],
+                        True, lc, 2, lineType=cv2.LINE_AA)
+                    p0, p1 = pts[0].tolist(), pts[1].tolist()  # plain Python ints
+                    cv2.line(bird_view, tuple(p0), tuple(p1), lc, 4,
+                             lineType=cv2.LINE_AA)  # thicker edge between corners 0-1
+        self.imgs[img_id] = bird_view
+
+    def add_bird_views(self, dets_dt, dets_gt, center_thresh=0.3, img_id='bird'):
+        """Draw dets_gt as filled polygons and dets_dt as outlines.
+        Rows scoring at or below center_thresh are skipped; result -> imgs[img_id].
+        """
+        bird_view = np.ones(
+            (self.out_size, self.out_size, 3), dtype=np.uint8) * 230
+        # dets_gt is drawn first (ii == 0), dets_dt afterwards on top of it.
+        for ii, (dets, lc) in enumerate(
+            [(dets_gt, (12, 49, 250)),
+             (dets_dt, (250, 152, 12))]):
+            for cat in dets:
+                for i in range(len(dets[cat])):
+                    if dets[cat][i, -1] > center_thresh:
+                        dim = dets[cat][i, 5:8]
+                        loc = dets[cat][i, 8:11]
+                        rot_y = dets[cat][i, 11]
+                        rect = compute_box_3d(dim, loc, rot_y)[:4, [0, 2]]
+                        for k in range(4):
+                            rect[k] = self.project_3d_to_bird(rect[k])
+                        # OpenCV drawing calls need integer pixel coordinates.
+                        pts = rect.astype(np.int32)
+                        poly = [pts.reshape(-1, 1, 2)]
+                        if ii == 0:
+                            cv2.fillPoly(
+                                bird_view, poly,
+                                lc, lineType=cv2.LINE_AA)
+                        else:
+                            cv2.polylines(
+                                bird_view, poly,
+                                True, lc, 2, lineType=cv2.LINE_AA)
+                        # Thicker edge between corners 0 and 1.
+                        p0, p1 = pts[0].tolist(), pts[1].tolist()
+                        cv2.line(bird_view, tuple(p0), tuple(p1), lc, 4,
+                                 lineType=cv2.LINE_AA)
+        self.imgs[img_id] = bird_view
+
+
+kitti_class_name = [  # three single-letter labels; their meaning is not shown here
+    'p', 'v', 'b'
+]
+# Two single-letter labels (the first two of the list above).
+gta_class_name = [
+    'p', 'v'
+]
+# Twenty lower-case category names.
+pascal_class_name = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus",
+                     "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike",
+                     "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
+# Eighty category names; presumably indexed by 0-based class id — TODO confirm.
+coco_class_name = [
+    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
+    'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
+    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
+    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
+    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
+    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
+    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
+    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
+    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+    'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+    'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+]
+
+color_list = np.array(  # flat list of values in [0, 1], three per colour
+    [
+        1.000, 1.000, 1.000,
+        0.850, 0.325, 0.098,
+        0.929, 0.694, 0.125,
+        0.494, 0.184, 0.556,
+        0.466, 0.674, 0.188,
+        0.301, 0.745, 0.933,
+        0.635, 0.078, 0.184,
+        0.300, 0.300, 0.300,
+        0.600, 0.600, 0.600,
+        1.000, 0.000, 0.000,
+        1.000, 0.500, 0.000,
+        0.749, 0.749, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.333, 0.333, 0.000,
+        0.333, 0.667, 0.000,
+        0.333, 1.000, 0.000,
+        0.667, 0.333, 0.000,
+        0.667, 0.667, 0.000,
+        0.667, 1.000, 0.000,
+        1.000, 0.333, 0.000,
+        1.000, 0.667, 0.000,
+        1.000, 1.000, 0.000,
+        0.000, 0.333, 0.500,
+        0.000, 0.667, 0.500,
+        0.000, 1.000, 0.500,
+        0.333, 0.000, 0.500,
+        0.333, 0.333, 0.500,
+        0.333, 0.667, 0.500,
+        0.333, 1.000, 0.500,
+        0.667, 0.000, 0.500,
+        0.667, 0.333, 0.500,
+        0.667, 0.667, 0.500,
+        0.667, 1.000, 0.500,
+        1.000, 0.000, 0.500,
+        1.000, 0.333, 0.500,
+        1.000, 0.667, 0.500,
+        1.000, 1.000, 0.500,
+        0.000, 0.333, 1.000,
+        0.000, 0.667, 1.000,
+        0.000, 1.000, 1.000,
+        0.333, 0.000, 1.000,
+        0.333, 0.333, 1.000,
+        0.333, 0.667, 1.000,
+        0.333, 1.000, 1.000,
+        0.667, 0.000, 1.000,
+        0.667, 0.333, 1.000,
+        0.667, 0.667, 1.000,
+        0.667, 1.000, 1.000,
+        1.000, 0.000, 1.000,
+        1.000, 0.333, 1.000,
+        1.000, 0.667, 1.000,
+        0.167, 0.000, 0.000,
+        0.333, 0.000, 0.000,
+        0.500, 0.000, 0.000,
+        0.667, 0.000, 0.000,
+        0.833, 0.000, 0.000,
+        1.000, 0.000, 0.000,
+        0.000, 0.167, 0.000,
+        0.000, 0.333, 0.000,
+        0.000, 0.500, 0.000,
+        0.000, 0.667, 0.000,
+        0.000, 0.833, 0.000,
+        0.000, 1.000, 0.000,
+        0.000, 0.000, 0.167,
+        0.000, 0.000, 0.333,
+        0.000, 0.000, 0.500,
+        0.000, 0.000, 0.667,
+        0.000, 0.000, 0.833,
+        0.000, 0.000, 1.000,
+        0.000, 0.000, 0.000,
+        0.143, 0.143, 0.143,
+        0.286, 0.286, 0.286,
+        0.429, 0.429, 0.429,
+        0.571, 0.571, 0.571,
+        0.714, 0.714, 0.714,
+        0.857, 0.857, 0.857,
+        0.000, 0.447, 0.741,
+        0.50, 0.5, 0
+    ]
+).astype(np.float32)
+color_list = color_list.reshape((-1, 3)) * 255  # one row per colour, scaled to 0-255
diff --git a/src/lib/utils/image.py b/src/lib/utils/image.py
new file mode 100755
index 0000000..9967a85
--- /dev/null
+++ b/src/lib/utils/image.py
@@ -0,0 +1,230 @@
+# ------------------------------------------------------------------------------
+# Copyright (c) Microsoft
+# Licensed under the MIT License.
+# Written by Bin Xiao (Bin.Xiao@microsoft.com)
+# Modified by Xingyi Zhou
+# ------------------------------------------------------------------------------
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import cv2
+import random
+
+def flip(img):  # reverses the third axis and returns an independent copy
+  return img[:, :, ::-1].copy()  
+
+def transform_preds(coords, center, scale, output_size):  # inverse-warp points
+    mapped = np.zeros(coords.shape)
+    inv_trans = get_affine_transform(center, scale, 0, output_size, inv=1)
+    for row, pt in enumerate(coords):
+        mapped[row, 0:2] = affine_transform(pt[0:2], inv_trans)
+    return mapped
+
+
+def get_affine_transform(center,
+                         scale,
+                         rot,
+                         output_size,
+                         shift=np.array([0, 0], dtype=np.float32),
+                         inv=0):
+    """Build the affine matrix between a source box and the output grid.
+
+    Three point pairs (centre, a rotated offset from it, and a perpendicular
+    third point) are handed to cv2.getAffineTransform; a truthy inv swaps
+    the roles of source and destination.
+    """
+    # A scale that is neither ndarray nor list is expanded to a 2-vector.
+    if not isinstance(scale, (np.ndarray, list)):
+        scale = np.array([scale, scale], dtype=np.float32)
+
+    out_w, out_h = output_size[0], output_size[1]
+    angle = np.pi * rot / 180
+    src_offset = get_dir([0, scale[0] * -0.5], angle)
+    dst_offset = np.array([0, out_w * -0.5], np.float32)
+    dst_center = np.array([out_w * 0.5, out_h * 0.5], np.float32)
+
+    # Row 0: centre; row 1: centre plus offset; row 2: derived third point.
+    src_pts = np.zeros((3, 2), dtype=np.float32)
+    dst_pts = np.zeros((3, 2), dtype=np.float32)
+    src_pts[0, :] = center + scale * shift
+    src_pts[1, :] = center + src_offset + scale * shift
+    dst_pts[0, :] = [out_w * 0.5, out_h * 0.5]
+    dst_pts[1, :] = dst_center + dst_offset
+    src_pts[2:, :] = get_3rd_point(src_pts[0, :], src_pts[1, :])
+    dst_pts[2:, :] = get_3rd_point(dst_pts[0, :], dst_pts[1, :])
+
+    first, second = (dst_pts, src_pts) if inv else (src_pts, dst_pts)
+    return cv2.getAffineTransform(np.float32(first), np.float32(second))
+
+
+def affine_transform(pt, t):
+    # Lift pt to homogeneous form, apply t, return the first two components.
+    homogeneous = np.array([pt[0], pt[1], 1.], dtype=np.float32).T
+    return np.dot(t, homogeneous)[:2]
+
+
+def get_3rd_point(a, b):
+    delta = a - b  # the result is b plus delta turned by 90 degrees
+    return b + np.array([-delta[1], delta[0]], dtype=np.float32)
+
+
+def get_dir(src_point, rot_rad):
+    """Rotate the 2D src_point by rot_rad radians; returns a two-item list."""
+    sin_r, cos_r = np.sin(rot_rad), np.cos(rot_rad)
+    px, py = src_point[0], src_point[1]
+
+    rotated_x = px * cos_r - py * sin_r
+    rotated_y = px * sin_r + py * cos_r
+    return [rotated_x, rotated_y]
+
+
+def crop(img, center, scale, output_size, rot=0):
+    """Warp img into an output_size window using bilinear interpolation."""
+    warp = get_affine_transform(center, scale, rot, output_size)
+    dsize = (int(output_size[0]), int(output_size[1]))
+
+    # dsize is handed to OpenCV as (output_size[0], output_size[1]).
+    return cv2.warpAffine(img,
+                          warp,
+                          dsize, flags=cv2.INTER_LINEAR)
+
+
+def gaussian_radius(det_size, min_overlap=0.7):
+  height, width = det_size
+  # Three quadratics (a, b, c) are evaluated; the smallest r is returned.
+  a1  = 1
+  b1  = (height + width)
+  c1  = width * height * (1 - min_overlap) / (1 + min_overlap)
+  sq1 = np.sqrt(b1 ** 2 - 4 * a1 * c1)
+  r1  = (b1 + sq1) / 2
+  # NOTE(review): every r is (b + sq) / 2, not (-b +/- sq) / (2 * a) — confirm intended.
+  a2  = 4
+  b2  = 2 * (height + width)
+  c2  = (1 - min_overlap) * width * height
+  sq2 = np.sqrt(b2 ** 2 - 4 * a2 * c2)
+  r2  = (b2 + sq2) / 2
+  # Third case: coefficients scaled by min_overlap; b3 is negative for positive sizes.
+  a3  = 4 * min_overlap
+  b3  = -2 * min_overlap * (height + width)
+  c3  = (min_overlap - 1) * width * height
+  sq3 = np.sqrt(b3 ** 2 - 4 * a3 * c3)
+  r3  = (b3 + sq3) / 2
+  return min(r1, r2, r3)
+
+
+def gaussian2D(shape, sigma=1):
+    # Unnormalised Gaussian sampled on a grid centred at zero.
+    half_rows, half_cols = [(extent - 1.) / 2. for extent in shape]
+    ys, xs = np.ogrid[-half_rows:half_rows+1,-half_cols:half_cols+1]
+    kernel = np.exp(-(xs * xs + ys * ys) / (2 * sigma * sigma))
+    kernel[kernel < np.finfo(kernel.dtype).eps * kernel.max()] = 0  # zero tiny tails
+    return kernel
+
+def draw_umich_gaussian(heatmap, center, radius, k=1):
+  diameter = 2 * radius + 1
+  gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)
+  # Stamps a diameter-wide Gaussian, scaled by k, onto heatmap in place.
+  x, y = int(center[0]), int(center[1])
+
+  height, width = heatmap.shape[0:2]
+  # Clip the window extents so the slices stay inside the heatmap borders.
+  left, right = min(x, radius), min(width - x, radius + 1)
+  top, bottom = min(y, radius), min(height - y, radius + 1)
+  # masked_heatmap is a slice of heatmap; the maximum is written back through out=.
+  masked_heatmap  = heatmap[y - top:y + bottom, x - left:x + right]
+  masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right]
+  if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: # TODO debug
+    np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
+  return heatmap
+
+def draw_dense_reg(regmap, heatmap, center, value, radius, is_offset=False):
+  diameter = 2 * radius + 1
+  gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)
+  value = np.array(value, dtype=np.float32).reshape(-1, 1, 1)
+  dim = value.shape[0]
+  reg = np.ones((dim, diameter*2+1, diameter*2+1), dtype=np.float32) * value  # larger than the gaussian; only a window is read below
+  if is_offset and dim == 2:
+    delta = np.arange(diameter*2+1) - radius  # zero at index radius
+    reg[0] = reg[0] - delta.reshape(1, -1)
+    reg[1] = reg[1] - delta.reshape(-1, 1)
+  # Writes value into regmap wherever the new Gaussian is >= the current heatmap.
+  x, y = int(center[0]), int(center[1])
+
+  height, width = heatmap.shape[0:2]
+  # Clip the window extents so the slices stay inside the heatmap borders.
+  left, right = min(x, radius), min(width - x, radius + 1)
+  top, bottom = min(y, radius), min(height - y, radius + 1)
+
+  masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
+  masked_regmap = regmap[:, y - top:y + bottom, x - left:x + right]
+  masked_gaussian = gaussian[radius - top:radius + bottom,
+                             radius - left:radius + right]
+  masked_reg = reg[:, radius - top:radius + bottom,
+                      radius - left:radius + right]
+  if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: # TODO debug
+    idx = (masked_gaussian >= masked_heatmap).reshape(
+      1, masked_gaussian.shape[0], masked_gaussian.shape[1])
+    masked_regmap = (1-idx) * masked_regmap + idx * masked_reg  # new array, not a view
+  regmap[:, y - top:y + bottom, x - left:x + right] = masked_regmap  # heatmap itself is not modified here
+  return regmap
+
+
+def draw_msra_gaussian(heatmap, center, sigma):
+  tmp_size = sigma * 3  # half-width of the patch
+  mu_x = int(center[0] + 0.5)
+  mu_y = int(center[1] + 0.5)
+  w, h = heatmap.shape[0], heatmap.shape[1]  # note: w holds shape[0], h holds shape[1]
+  ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)]
+  br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)]
+  if ul[0] >= h or ul[1] >= w or br[0] < 0 or br[1] < 0:  # patch entirely outside
+    return heatmap
+  size = 2 * tmp_size + 1
+  x = np.arange(0, size, 1, np.float32)
+  y = x[:, np.newaxis]
+  x0 = y0 = size // 2
+  g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2))  # peak value 1 at the centre
+  g_x = max(0, -ul[0]), min(br[0], h) - ul[0]  # usable range inside the patch
+  g_y = max(0, -ul[1]), min(br[1], w) - ul[1]
+  img_x = max(0, ul[0]), min(br[0], h)  # matching range inside the heatmap
+  img_y = max(0, ul[1]), min(br[1], w)
+  heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]] = np.maximum(  # in-place element-wise max
+    heatmap[img_y[0]:img_y[1], img_x[0]:img_x[1]],
+    g[g_y[0]:g_y[1], g_x[0]:g_x[1]])
+  return heatmap
+
+def grayscale(image):  # single-channel conversion; input is treated as BGR
+    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+def lighting_(data_rng, image, alphastd, eigval, eigvec):
+    weights = data_rng.normal(scale=alphastd, size=(3, ))  # three normal draws
+    image += np.dot(eigvec, eigval * weights)  # image is shifted in place
+
+def blend_(alpha, image1, image2):  # image1 <- alpha * image1 + (1 - alpha) * image2
+    image1 *= alpha
+    image2 *= (1 - alpha)  # NOTE: an array image2 is scaled in place as well
+    image1 += image2
+
+def saturation_(data_rng, image, gs, gs_mean, var):
+    factor = 1. + data_rng.uniform(low=-var, high=var)  # gs_mean is unused here
+    blend_(factor, image, gs[:, :, None])
+
+def brightness_(data_rng, image, gs, gs_mean, var):
+    gain = 1. + data_rng.uniform(low=-var, high=var)  # gs and gs_mean are unused
+    image *= gain
+
+def contrast_(data_rng, image, gs, gs_mean, var):
+    factor = 1. + data_rng.uniform(low=-var, high=var)  # gs is unused here
+    blend_(factor, image, gs_mean)
+
+def color_aug(data_rng, image, eig_val, eig_vec):
+    """Apply brightness/contrast/saturation jitter in shuffled order, then lighting_."""
+    jitters = [brightness_, contrast_, saturation_]
+    random.shuffle(jitters)
+    gray = grayscale(image)
+    gray_mean = gray.mean()  # taken before any jitter touches image
+    for jitter in jitters:
+        jitter(data_rng, image, gray, gray_mean, 0.4)
+    lighting_(data_rng, image, 0.1, eig_val, eig_vec)
diff --git a/src/lib/utils/oracle_utils.py b/src/lib/utils/oracle_utils.py
new file mode 100644
index 0000000..d54c35e
--- /dev/null
+++ b/src/lib/utils/oracle_utils.py
@@ -0,0 +1,42 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import numba
+
+@numba.jit(nopython=True, nogil=True)
+def gen_oracle_map(feat, ind, w, h):
+  # feat: B x maxN x featDim
+  # ind: B x maxN (flat cell index: x = ind % w, y = ind // w)
+  batch_size = feat.shape[0]
+  max_objs = feat.shape[1]
+  feat_dim = feat.shape[2]
+  out = np.zeros((batch_size, feat_dim, h, w), dtype=np.float32)
+  vis = np.zeros((batch_size, h, w), dtype=np.uint8)  # 1 once a cell has been filled
+  ds = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # 4-connected neighbour offsets
+  for i in range(batch_size):
+    queue_ind = np.zeros((h*w*2, 2), dtype=np.int32)  # array-backed FIFO of (x, y)
+    queue_feat = np.zeros((h*w*2, feat_dim), dtype=np.float32)  # feature carried by each entry
+    head, tail = 0, 0
+    for j in range(max_objs):
+      if ind[i][j] > 0:  # entries <= 0 are skipped, so flat index 0 is never seeded
+        x, y = ind[i][j] % w, ind[i][j] // w
+        out[i, :, y, x] = feat[i][j]
+        vis[i, y, x] = 1
+        queue_ind[tail] = x, y
+        queue_feat[tail] = feat[i][j]
+        tail += 1
+    while tail - head > 0:  # breadth-first expansion from all seeds at once
+      x, y = queue_ind[head]
+      f = queue_feat[head]
+      head += 1
+      for (dx, dy) in ds:
+        xx, yy = x + dx, y + dy
+        if xx >= 0 and yy >= 0 and xx < w and yy < h and vis[i, yy, xx] < 1:
+          out[i, :, yy, xx] = f  # an unvisited neighbour copies the feature of the cell that reached it
+          vis[i, yy, xx] = 1
+          queue_ind[tail] = xx, yy
+          queue_feat[tail] = f
+          tail += 1
+  return out
\ No newline at end of file
diff --git a/src/lib/utils/post_process.py b/src/lib/utils/post_process.py
new file mode 100644
index 0000000..1b5e1dd
--- /dev/null
+++ b/src/lib/utils/post_process.py
@@ -0,0 +1,150 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from .image import transform_preds, get_affine_transform
+from .ddd_utils import ddd2locrot
+import cv2
+from pycocotools import mask as mask_utils
+
+
+def get_pred_depth(depth):  # identity: the input is returned unchanged
+    return depth
+
+
+def get_alpha(rot):
+    # rot: (B, 8) [bin1_cls[0], bin1_cls[1], bin1_sin, bin1_cos,
+    #              bin2_cls[0], bin2_cls[1], bin2_sin, bin2_cos]
+    # Take bin 1's angle where column 1 exceeds column 5, otherwise bin 2's.
+    use_bin1 = rot[:, 1] > rot[:, 5]
+    angle_bin1 = np.arctan2(rot[:, 2], rot[:, 3]) + (-0.5 * np.pi)
+    angle_bin2 = np.arctan2(rot[:, 6], rot[:, 7]) + (0.5 * np.pi)
+    return angle_bin1 * use_bin1 + angle_bin2 * (1 - use_bin1)
+
+
+def ddd_post_process_2d(dets, c, s, opt):
+    # dets: batch x max_dets x dim
+    # return 1-based class det list
+    ret = []
+    include_wh = dets.shape[2] > 16  # columns 15:17 are only read when present
+    for i in range(dets.shape[0]):
+        top_preds = {}
+        dets[i, :, :2] = transform_preds(  # NOTE: overwrites the caller's dets in place
+            dets[i, :, 0:2], c[i], s[i], (opt.output_w, opt.output_h))
+        classes = dets[i, :, -1]  # last column holds the 0-based class id
+        for j in range(opt.num_classes):
+            inds = (classes == j)
+            top_preds[j + 1] = np.concatenate([  # 8 columns: 3 + alpha + depth + 3
+                dets[i, inds, :3].astype(np.float32),
+                get_alpha(dets[i, inds, 3:11])[
+                    :, np.newaxis].astype(np.float32),
+                get_pred_depth(dets[i, inds, 11:12]).astype(np.float32),
+                dets[i, inds, 12:15].astype(np.float32)], axis=1)
+            if include_wh:  # NOTE(review): 15:17 is warped like a point yet later read as wh — confirm
+                top_preds[j + 1] = np.concatenate([
+                    top_preds[j + 1],
+                    transform_preds(
+                        dets[i, inds, 15:17], c[i], s[i], (opt.output_w, opt.output_h))
+                    .astype(np.float32)], axis=1)
+        ret.append(top_preds)
+    return ret
+
+
+def ddd_post_process_3d(dets, calibs):
+    # dets: one {class id: rows} mapping per batch item (indexed dets[i][cls_ind][j])
+    # return 1-based class det list
+    ret = []
+    for i in range(len(dets)):
+        preds = {}
+        for cls_ind in dets[i].keys():
+            preds[cls_ind] = []
+            for j in range(len(dets[i][cls_ind])):
+                center = dets[i][cls_ind][j][:2]
+                score = dets[i][cls_ind][j][2]
+                alpha = dets[i][cls_ind][j][3]
+                depth = dets[i][cls_ind][j][4]
+                dimensions = dets[i][cls_ind][j][5:8]
+                wh = dets[i][cls_ind][j][8:10]  # needs rows with at least 10 columns
+                locations, rotation_y = ddd2locrot(  # NOTE(review): calibs[0] for every i — confirm
+                    center, alpha, dimensions, depth, calibs[0])
+                bbox = [center[0] - wh[0] / 2, center[1] - wh[1] / 2,
+                        center[0] + wh[0] / 2, center[1] + wh[1] / 2]
+                pred = [alpha] + bbox + dimensions.tolist() + \
+                    locations.tolist() + [rotation_y, score]
+                preds[cls_ind].append(pred)  # row: alpha, bbox(4), dimensions, locations, rotation_y, score
+            preds[cls_ind] = np.array(preds[cls_ind], dtype=np.float32)
+        ret.append(preds)
+    return ret
+
+
+def ddd_post_process(dets, c, s, calibs, opt):
+    # dets: batch x max_dets x dim
+    # return 1-based class det list
+    # Two stages: per-class 2D decoding, then the ddd2locrot-based 3D step.
+    dets_2d = ddd_post_process_2d(dets, c, s, opt)
+    return ddd_post_process_3d(dets_2d, calibs)
+
+
+def ctdet_post_process(dets, c, s, h, w, num_classes):
+    # dets: batch x max_dets x dim
+    # return 1-based class det dict
+    # NOTE: columns 0-3 of dets are overwritten in place with warped points.
+    results = []
+    for b in range(dets.shape[0]):
+        for lo in (0, 2):  # the two coordinate pairs in columns 0:2 and 2:4
+            dets[b, :, lo:lo + 2] = transform_preds(
+                dets[b, :, lo:lo + 2], c[b], s[b], (w, h))
+        labels = dets[b, :, -1]
+        per_class = {}
+        for cls_id in range(num_classes):
+            keep = (labels == cls_id)
+            per_class[cls_id + 1] = np.concatenate([
+                dets[b, keep, :4].astype(np.float32),
+                dets[b, keep, 4:5].astype(np.float32)], axis=1).tolist()
+        results.append(per_class)
+    return results
+
+
+def ctseg_post_process(dets, masks, c, s, h, w, img_h, img_w, num_classes):
+    # dets: batch x max_dets x dim
+    # return 1-based class det dict
+    ret = []
+    for i in range(dets.shape[0]):
+        top_preds = {}
+        dets[i, :, :2] = transform_preds(  # NOTE: overwrites the caller's dets in place
+            dets[i, :, 0:2], c[i], s[i], (w, h))
+        dets[i, :, 2:4] = transform_preds(
+            dets[i, :, 2:4], c[i], s[i], (w, h))
+        classes = dets[i, :, -1]
+        # Same inverse warp as transform_preds uses, applied below to the masks.
+        trans = get_affine_transform(c[i], s[i], 0, (w, h), inv=1)
+        for j in range(num_classes):
+            inds = (classes == j)
+            # Per class: 'boxs' (first five columns) and one encoded mask per detection.
+            top_preds[j + 1] = {'boxs': np.concatenate([
+                dets[i, inds, :4].astype(np.float32),
+                dets[i, inds, 4:5].astype(np.float32)], axis=1),
+                'pred_mask': [mask_utils.encode(
+                    (np.asfortranarray(cv2.warpAffine(mask, trans, (img_w, img_h),
+                                                      flags=cv2.INTER_CUBIC) > 0.5).astype(np.uint8)))
+                for mask in masks[i, inds]]  # warped to (img_w, img_h), thresholded at 0.5
+            }
+        ret.append(top_preds)
+    return ret
+
+
+def multi_pose_post_process(dets, c, s, h, w):
+    # dets: batch x max_dets x 40
+    # return list of 39 in image coord
+    outputs = []
+    for b, det in enumerate(dets):
+        box_pts = transform_preds(det[:, :4].reshape(-1, 2), c[b], s[b], (w, h))
+        joint_pts = transform_preds(
+            det[:, 5:39].reshape(-1, 2), c[b], s[b], (w, h))
+        merged = np.concatenate(
+            [box_pts.reshape(-1, 4), det[:, 4:5], joint_pts.reshape(-1, 34)],
+            axis=1).astype(np.float32)
+        class_key = np.ones(1, dtype=np.int32)[0]  # numpy int32 scalar with value 1
+        outputs.append({class_key: merged.tolist()})
+    return outputs
diff --git a/src/lib/utils/utils.py b/src/lib/utils/utils.py
new file mode 100644
index 0000000..b9f72ca
--- /dev/null
+++ b/src/lib/utils/utils.py
@@ -0,0 +1,91 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+# from torch.utils.data.dataloader import int_classes, string_classes
+from torch._six import container_abcs, string_classes, int_classes
+import re
+import collections
+
+import torch
+
+np_str_obj_array_pattern = re.compile(r'[SaUO]')  # searched against ndarray dtype.str in collate
+
+class AverageMeter(object):
+    """Tracks the latest value plus a weighted running sum, count and mean."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        # Every statistic starts from zero.
+        self.val = self.avg = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        # val is recorded as-is and enters the sum with weight n.
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        # avg is refreshed only while count is positive (no division by zero).
+        if self.count > 0:
+            self.avg = self.sum / self.count
+
+collate_err_msg_format = (  # TypeError text used by collate; {} receives the offending type or dtype
+    "default_collate: batch must contain tensors, numpy arrays, numbers, "
+    "dicts or lists; found {}")
+
+def collate(batch):
+    r"""Puts each data field into a tensor with outer dimension batch size.
+    Mappings get special handling for an 'instance_mask' key (zero-padded)."""
+    elem = batch[0]  # dispatch is decided by the first element only
+    elem_type = type(elem)
+    if isinstance(elem, torch.Tensor):
+        out = None
+        if torch.utils.data.get_worker_info() is not None:
+            # If we're in a background process, concatenate directly into a
+            # shared memory tensor to avoid an extra copy
+            numel = sum([x.numel() for x in batch])
+            storage = elem.storage()._new_shared(numel)
+            out = elem.new(storage)
+        return torch.stack(batch, 0, out=out)
+    elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
+            and elem_type.__name__ != 'string_':
+        elem = batch[0]
+        if elem_type.__name__ == 'ndarray':
+            # array of string classes and object
+            if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                raise TypeError(collate_err_msg_format.format(elem.dtype))
+            # Other arrays are converted to tensors and collated again.
+            return collate([torch.as_tensor(b) for b in batch])
+        elif elem.shape == ():  # scalars
+            return torch.as_tensor(batch)
+    elif isinstance(elem, float):
+        return torch.tensor(batch, dtype=torch.float64)
+    elif isinstance(elem, int_classes):
+        return torch.tensor(batch)
+    elif isinstance(elem, string_classes):
+        return batch  # strings are returned as the untouched list
+    # Mappings: every key except 'instance_mask' is collated recursively.
+    elif isinstance(elem, container_abcs.Mapping):
+        res = {key: collate([d[key] for d in batch]) for key in elem if key != 'instance_mask'}
+        if 'instance_mask' in elem:
+            max_obj = max([d['instance_mask'].shape[0] for d in batch])  # largest first dimension in the batch
+            instance_mask = torch.zeros(
+                len(batch), max_obj, *(elem['instance_mask'].shape[1:]))
+            for i in range(len(batch)):
+                num_obj = batch[i]['instance_mask'].shape[0]
+                instance_mask[i, :num_obj] = torch.from_numpy(  # rows past num_obj stay zero
+                    batch[i]['instance_mask'])
+            res.update({'instance_mask': instance_mask})
+        return res
+    elif isinstance(elem, tuple) and hasattr(elem, '_fields'):  # namedtuple
+        return elem_type(*(collate(samples) for samples in zip(*batch)))
+    elif isinstance(elem, container_abcs.Sequence):
+        # check to make sure that the elements in batch have consistent size
+        it = iter(batch)
+        elem_size = len(next(it))
+        if not all(len(elem) == elem_size for elem in it):
+            raise RuntimeError('each element in list of batch should be of equal size')
+        transposed = zip(*batch)
+        return [collate(samples) for samples in transposed]
+
+    raise TypeError(collate_err_msg_format.format(elem_type))  # also reached by numpy values matching no case above
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
new file mode 100755
index 0000000..fd69b62
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,109 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import _init_paths
+
+import os
+
+import torch
+import torch.utils.data
+from opts import opts
+from models.model import create_model, load_model, save_model
+from models.data_parallel import DataParallel
+from logger import Logger
+from customdatasets.dataset_factory import get_dataset
+from trains.train_factory import train_factory
+
+from utils.utils import collate
+import sys
+
+
+def main(opt):
+    torch.manual_seed(opt.seed)
+    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
+    Dataset = get_dataset(opt.dataset, opt.task)
+    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
+    print(opt)
+
+    logger = Logger(opt)
+
+    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
+    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')  # a negative first gpu id selects the CPU
+
+    print('Creating model...')
+    model = create_model(opt.arch, opt.heads, opt.head_conv)
+    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
+    start_epoch = 0
+    if opt.load_model != '':  # optionally start from a saved checkpoint
+        model, optimizer, start_epoch = load_model(
+            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)
+
+    Trainer = train_factory[opt.task]
+    trainer = Trainer(opt, model, optimizer)
+    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
+
+    print('Setting up data...')
+    val_loader = torch.utils.data.DataLoader(  # validation: batch size 1, fixed order
+        Dataset(opt, 'val'),
+        batch_size=1,
+        shuffle=False,
+        num_workers=1,
+        pin_memory=True,
+        collate_fn=collate
+    )
+
+    if opt.test:  # evaluation-only mode: run validation once, evaluate, return
+        _, preds = trainer.val(0, val_loader)
+        val_loader.dataset.run_eval(preds, opt.save_dir)
+        return
+
+    train_loader = torch.utils.data.DataLoader(
+        Dataset(opt, 'train'),
+        batch_size=opt.batch_size,
+        shuffle=True,
+        num_workers=opt.num_workers,
+        pin_memory=True,
+        drop_last=True,
+        collate_fn=collate
+    )
+
+    print('Starting training...')
+    best = 1e10  # a lower opt.metric counts as better; start from a large sentinel
+    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
+        mark = epoch if opt.save_all else 'last'
+        log_dict_train, _ = trainer.train(epoch, train_loader)
+        logger.write('epoch: {} |'.format(epoch))
+        for k, v in log_dict_train.items():
+            logger.scalar_summary('train_{}'.format(k), v, epoch)
+            logger.write('{} {:8f} | '.format(k, v))
+        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:  # validation epoch
+            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
+                       epoch, model, optimizer)
+            with torch.no_grad():
+                log_dict_val, preds = trainer.val(epoch, val_loader)
+            for k, v in log_dict_val.items():
+                logger.scalar_summary('val_{}'.format(k), v, epoch)
+                logger.write('{} {:8f} | '.format(k, v))
+            if log_dict_val[opt.metric] < best:
+                best = log_dict_val[opt.metric]
+                save_model(os.path.join(opt.save_dir, 'model_best.pth'),  # saved without the optimizer
+                           epoch, model)
+        else:  # non-validation epochs only refresh model_last.pth
+            save_model(os.path.join(opt.save_dir, 'model_last.pth'),
+                       epoch, model, optimizer)
+        logger.write('\n')
+        if epoch in opt.lr_step:  # scheduled drop: lr = opt.lr * 0.1 ** (position in lr_step + 1)
+            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
+                       epoch, model, optimizer)
+            lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
+            print('Drop LR to', lr)
+            for param_group in optimizer.param_groups:
+                param_group['lr'] = lr
+    logger.close()
+
+
+if __name__ == '__main__':  # script entry point
+    print(sys.argv)  # echo the raw command line
+    opt = opts().parse()
+    main(opt)
diff --git a/src/test.py b/src/test.py
new file mode 100755
index 0000000..4d7c801
--- /dev/null
+++ b/src/test.py
@@ -0,0 +1,132 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import _init_paths
+
+import os
+import json
+import cv2
+import numpy as np
+import time
+from progress.bar import Bar
+import torch
+
+from external.nms import soft_nms
+from opts import opts
+from logger import Logger
+from utils.utils import AverageMeter
+from customdatasets.dataset_factory import dataset_factory
+from detectors.detector_factory import detector_factory
+
+
+class PrefetchDataset(torch.utils.data.Dataset):
+    def __init__(self, opt, dataset, pre_process_func):
+        self.images = dataset.images
+        self.load_image_func = dataset.coco.loadImgs
+        self.img_dir = dataset.img_dir
+        self.pre_process_func = pre_process_func
+        self.opt = opt
+
+    def __getitem__(self, index):
+        img_id = self.images[index]
+        img_info = self.load_image_func(ids=[img_id])[0]
+        img_path = os.path.join(self.img_dir, img_info['file_name'])
+        image = cv2.imread(img_path)
+        images, meta = {}, {}
+        for scale in opt.test_scales:
+            if opt.task == 'ddd':
+                images[scale], meta[scale] = self.pre_process_func(
+                    image, scale, img_info['calib'])
+            else:
+                images[scale], meta[scale] = self.pre_process_func(
+                    image, scale)
+        return img_id, {'images': images, 'image': image, 'meta': meta}
+
+    def __len__(self):
+        return len(self.images)
+
+
+def prefetch_test(opt):
+    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
+
+    Dataset = dataset_factory[opt.dataset]
+    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
+    print(opt)
+    Logger(opt)
+    Detector = detector_factory[opt.task]
+
+    split = 'val' if not opt.trainval else 'test'
+    dataset = Dataset(opt, split)
+    detector = Detector(opt)
+
+    data_loader = torch.utils.data.DataLoader(
+        PrefetchDataset(opt, dataset, detector.pre_process),
+        batch_size=1, shuffle=False, num_workers=1, pin_memory=True)
+
+    results = {}
+    num_iters = len(dataset)
+    bar = Bar('{}'.format(opt.exp_id), max=num_iters)
+    time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
+    avg_time_stats = {t: AverageMeter() for t in time_stats}
+    for ind, (img_id, pre_processed_images) in enumerate(data_loader):
+        ret = detector.run(pre_processed_images)
+        results[img_id.numpy().astype(np.int32)[0]] = ret['results']
+        Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
+            ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
+        for t in avg_time_stats:
+            avg_time_stats[t].update(ret[t])
+            Bar.suffix = Bar.suffix + '|{} {tm.val:.3f}s ({tm.avg:.3f}s) '.format(
+                t, tm=avg_time_stats[t])
+        bar.next()
+    bar.finish()
+    dataset.run_eval(results, opt.save_dir)
+
+
+def test(opt):
+    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
+
+    Dataset = dataset_factory[opt.dataset]
+    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
+    print(opt)
+    Logger(opt)
+    Detector = detector_factory[opt.task]
+
+    split = 'val' if not opt.trainval else 'test'
+    dataset = Dataset(opt, split)
+    detector = Detector(opt)
+
+    results = {}
+    num_iters = len(dataset)
+    bar = Bar('{}'.format(opt.exp_id), max=num_iters)
+    time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
+    avg_time_stats = {t: AverageMeter() for t in time_stats}
+    for ind in range(num_iters):
+        img_id = dataset.images[ind]
+        img_info = dataset.coco.loadImgs(ids=[img_id])[0]
+        img_path = os.path.join(dataset.img_dir, img_info['file_name'])
+
+        if opt.task == 'ddd':
+            ret = detector.run(img_path, img_info['calib'])
+        else:
+            ret = detector.run(img_path)
+
+        results[img_id] = ret['results']
+
+        Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
+            ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
+        for t in avg_time_stats:
+            avg_time_stats[t].update(ret[t])
+            Bar.suffix = Bar.suffix + \
+                '|{} {:.3f} '.format(t, avg_time_stats[t].avg)
+        bar.next()
+    bar.finish()
+    dataset.run_eval(results, opt.save_dir)
+
+
+if __name__ == '__main__':
+    opt = opts().parse()
+    if opt.not_prefetch_test:
+        test(opt)
+    else:
+        prefetch_test(opt)
diff --git a/src/tools/_init_paths.py b/src/tools/_init_paths.py
new file mode 100644
index 0000000..aca4fdb
--- /dev/null
+++ b/src/tools/_init_paths.py
@@ -0,0 +1,12 @@
+import os.path as osp
+import sys
+
+def add_path(path):
+    if path not in sys.path:
+        sys.path.insert(0, path)
+
+this_dir = osp.dirname(__file__)
+
+# Add lib to PYTHONPATH
+lib_path = osp.join(this_dir, '../lib')
+add_path(lib_path)
diff --git a/src/tools/calc_coco_overlap.py b/src/tools/calc_coco_overlap.py
new file mode 100644
index 0000000..a8411e5
--- /dev/null
+++ b/src/tools/calc_coco_overlap.py
@@ -0,0 +1,322 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pycocotools.coco as COCO
+import cv2
+import numpy as np
+from pycocotools import mask as maskUtils
+ANN_PATH = '../../data/coco/annotations/'
+IMG_PATH = '../../data/coco/'
+ANN_FILES = {'train': 'instances_train2017.json',
+             'val': 'instances_val2017.json'}
+DEBUG = False
+RESIZE = True
+
+class_name = [
+    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane',
+    'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant',
+    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
+    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
+    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
+    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
+    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass',
+    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
+    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
+    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+    'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+    'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+]
+
+def iou(box1, box2):
+  area1 = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
+  area2 = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)
+  inter = max(min(box1[2], box2[2]) - max(box1[0], box2[0]) + 1, 0) * \
+          max(min(box1[3], box2[3]) - max(box1[1], box2[1]) + 1, 0)
+  iou = 1.0 * inter / (area1 + area2 - inter)
+  return iou
+
+def generate_anchors(
+    stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)
+):
+    """Generates a matrix of anchor boxes in (x1, y1, x2, y2) format. Anchors
+    are centered on stride / 2, have (approximate) sqrt areas of the specified
+    sizes, and aspect ratios as given.
+    """
+    return _generate_anchors(
+        stride,
+        np.array(sizes, dtype=np.float) / stride,
+        np.array(aspect_ratios, dtype=np.float)
+    )
+
+
+def _generate_anchors(base_size, scales, aspect_ratios):
+    """Generate anchor (reference) windows by enumerating aspect ratios X
+    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.
+    """
+    anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1
+    anchors = _ratio_enum(anchor, aspect_ratios)
+    anchors = np.vstack(
+        [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])]
+    )
+    return anchors
+
+
+def _whctrs(anchor):
+    """Return width, height, x center, and y center for an anchor (window)."""
+    w = anchor[2] - anchor[0] + 1
+    h = anchor[3] - anchor[1] + 1
+    x_ctr = anchor[0] + 0.5 * (w - 1)
+    y_ctr = anchor[1] + 0.5 * (h - 1)
+    return w, h, x_ctr, y_ctr
+
+
+def _mkanchors(ws, hs, x_ctr, y_ctr):
+    """Given a vector of widths (ws) and heights (hs) around a center
+    (x_ctr, y_ctr), output a set of anchors (windows).
+    """
+    ws = ws[:, np.newaxis]
+    hs = hs[:, np.newaxis]
+    anchors = np.hstack(
+        (
+            x_ctr - 0.5 * (ws - 1),
+            y_ctr - 0.5 * (hs - 1),
+            x_ctr + 0.5 * (ws - 1),
+            y_ctr + 0.5 * (hs - 1)
+        )
+    )
+    return anchors
+
+
+def _ratio_enum(anchor, ratios):
+    """Enumerate a set of anchors for each aspect ratio wrt an anchor."""
+    w, h, x_ctr, y_ctr = _whctrs(anchor)
+    size = w * h
+    size_ratios = size / ratios
+    ws = np.round(np.sqrt(size_ratios))
+    hs = np.round(ws * ratios)
+    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
+    return anchors
+
+
+def _scale_enum(anchor, scales):
+    """Enumerate a set of anchors for each scale wrt an anchor."""
+    w, h, x_ctr, y_ctr = _whctrs(anchor)
+    ws = w * scales
+    hs = h * scales
+    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
+    return anchors
+
+
+def _coco_box_to_bbox(box):
+    bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                    dtype=np.float32)
+    return bbox
+
+def count_agnostic(split):
+  coco = COCO.COCO(ANN_PATH + ANN_FILES[split])
+  images = coco.getImgIds()
+  cnt = 0
+  for img_id in images:
+    ann_ids = coco.getAnnIds(imgIds=[img_id])
+    anns = coco.loadAnns(ids=ann_ids)
+    centers = []
+    for ann in anns:
+      bbox = ann['bbox']
+      center = ((bbox[0] + bbox[2] / 2) // 4, (bbox[1] + bbox[3] / 2) // 4)
+      for c in centers:
+        if center[0] == c[0] and center[1] == c[1]:
+          cnt += 1
+      centers.append(center)
+  print('find {} collisions!'.format(cnt))
+
+
+def count(split):
+  coco = COCO.COCO(ANN_PATH + ANN_FILES[split])
+  images = coco.getImgIds()
+  cnt = 0
+  obj = 0
+  for img_id in images:
+    ann_ids = coco.getAnnIds(imgIds=[img_id])
+    anns = coco.loadAnns(ids=ann_ids)
+    centers = []
+    obj += len(anns)
+    for ann in anns:
+      if ann['iscrowd'] > 0:
+        continue
+      bbox = ann['bbox']
+      center = ((bbox[0] + bbox[2] / 2) // 4, (bbox[1] + bbox[3] / 2) // 4, ann['category_id'], bbox)
+      for c in centers:
+        if center[0] == c[0] and center[1] == c[1] and center[2] == c[2] and \
+           iou(_coco_box_to_bbox(bbox), _coco_box_to_bbox(c[3])) < 2:# 0.5:
+          cnt += 1
+          if DEBUG:
+            file_name = coco.loadImgs(ids=[img_id])[0]['file_name']
+            img = cv2.imread('{}/{}2017/{}'.format(IMG_PATH, split, file_name))
+            x1, y1 = int(c[3][0]), int(c[3][1]), 
+            x2, y2 = int(c[3][0] + c[3][2]), int(c[3][1] + c[3][3]) 
+            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2, cv2.LINE_AA)
+            x1, y1 = int(center[3][0]), int(center[3][1]), 
+            x2, y2 = int(center[3][0] + center[3][2]), int(center[3][1] + center[3][3]) 
+            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2, cv2.LINE_AA)
+            cv2.imshow('img', img)
+            cv2.waitKey()
+      centers.append(center)
+  print('find {} collisions of {} objects!'.format(cnt, obj))
+
+def count_iou(split):
+  coco = COCO.COCO(ANN_PATH + ANN_FILES[split])
+  images = coco.getImgIds()
+  cnt = 0
+  obj = 0
+  for img_id in images:
+    ann_ids = coco.getAnnIds(imgIds=[img_id])
+    anns = coco.loadAnns(ids=ann_ids)
+    bboxes = []
+    obj += len(anns)
+    for ann in anns:
+      if ann['iscrowd'] > 0:
+        continue
+      bbox = _coco_box_to_bbox(ann['bbox']).tolist() + [ann['category_id']]
+      for b in bboxes:
+        if iou(b, bbox) > 0.5 and b[4] == bbox[4]:
+          cnt += 1
+          if DEBUG:
+            file_name = coco.loadImgs(ids=[img_id])[0]['file_name']
+            img = cv2.imread('{}/{}2017/{}'.format(IMG_PATH, split, file_name))
+            x1, y1 = int(b[0]), int(b[1]), 
+            x2, y2 = int(b[2]), int(b[3]) 
+            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2, cv2.LINE_AA)
+            x1, y1 = int(bbox[0]), int(bbox[1]), 
+            x2, y2 = int(bbox[2]), int(bbox[3]) 
+            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2, cv2.LINE_AA)
+            cv2.imshow('img', img)
+            print('cats', class_name[b[4]], class_name[bbox[4]])
+            cv2.waitKey()
+      bboxes.append(bbox)
+  print('find {} collisions of {} objects!'.format(cnt, obj))
+
+
+def count_anchor(split):
+  coco = COCO.COCO(ANN_PATH + ANN_FILES[split])
+  images = coco.getImgIds()
+  cnt = 0
+  obj = 0
+  stride = 16
+  anchor = generate_anchors().reshape(15, 2, 2)
+  miss_s, miss_m, miss_l = 0, 0, 0
+  N = len(images)
+  print(N, 'images')
+  for ind, img_id in enumerate(images):
+    if ind % 1000 == 0:
+      print(ind, N)
+    anchors = []
+    ann_ids = coco.getAnnIds(imgIds=[img_id])
+    anns = coco.loadAnns(ids=ann_ids)
+    obj += len(anns)
+    img_info = coco.loadImgs(ids=[img_id])[0]
+    h, w = img_info['height'], img_info['width']
+    if RESIZE:
+      if h > w:
+        for i in range(len(anns)):
+          anns[i]['bbox'][0] *= 800 / w
+          anns[i]['bbox'][1] *= 800 / w
+          anns[i]['bbox'][2] *= 800 / w
+          anns[i]['bbox'][3] *= 800 / w
+        h = h * 800 // w
+        w = 800 
+      else:
+        for i in range(len(anns)):
+          anns[i]['bbox'][0] *= 800 / h
+          anns[i]['bbox'][1] *= 800 / h
+          anns[i]['bbox'][2] *= 800 / h
+          anns[i]['bbox'][3] *= 800 / h
+        w = w * 800 // h
+        h = 800 
+    for i in range(w // stride):
+      for j in range(h // stride):
+        ct = np.array([i * stride, j * stride], dtype=np.float32).reshape(1, 1, 2)
+        anchors.append(anchor + ct)
+    anchors = np.concatenate(anchors, axis=0).reshape(-1, 4)
+    anchors[:, 2:4] = anchors[:, 2:4] - anchors[:, 0:2]
+    anchors = anchors.tolist()
+    # import pdb; pdb.set_trace()
+    g = [g['bbox'] for g in anns]
+    iscrowd = [int(o['iscrowd']) for o in anns]
+    ious = maskUtils.iou(anchors,g,iscrowd)
+    for t in range(len(g)):
+      if ious[:, t].max() < 0.5:
+        s = anns[t]['area']
+        if s < 32 ** 2:
+          miss_s += 1
+        elif s < 96 ** 2:
+          miss_m += 1
+        else:
+          miss_l += 1
+    if DEBUG:
+      file_name = coco.loadImgs(ids=[img_id])[0]['file_name']
+      img = cv2.imread('{}/{}2017/{}'.format(IMG_PATH, split, file_name))
+      if RESIZE:
+        img = cv2.resize(img, (w, h))
+      for t, gt in enumerate(g):
+        if anns[t]['iscrowd'] > 0:
+          continue
+        x1, y1, x2, y2 = _coco_box_to_bbox(gt)
+        cl = (0, 0, 255) if ious[:, t].max() < 0.5 else (0, 255, 0)
+        cv2.rectangle(img, (x1, y1), (x2, y2), cl, 2, cv2.LINE_AA)
+        for k in range(len(anchors)):
+          if ious[k, t] > 0.5:
+            x1, y1, x2, y2 = _coco_box_to_bbox(anchors[k])
+            cl = (np.array([255, 0, 0]) * ious[k, t]).astype(np.int32).tolist()
+            cv2.rectangle(img, (x1, y1), (x2, y2), cl, 1, cv2.LINE_AA)
+      cv2.imshow('img', img)
+      cv2.waitKey()
+    miss = 0
+    if len(ious) > 0:
+      miss = (ious.max(axis=0) < 0.5).sum()
+    cnt += miss
+  print('cnt, obj, ratio ', cnt, obj, cnt / obj)
+  print('s, m, l ', miss_s, miss_m, miss_l)
+    # import pdb; pdb.set_trace()
+
+
+def count_size(split):
+  coco = COCO.COCO(ANN_PATH + ANN_FILES[split])
+  images = coco.getImgIds()
+  cnt = 0
+  obj = 0
+  stride = 16
+  anchor = generate_anchors().reshape(15, 2, 2)
+  cnt_s, cnt_m, cnt_l = 0, 0, 0
+  N = len(images)
+  print(N, 'images')
+  for ind, img_id in enumerate(images):
+    anchors = []
+    ann_ids = coco.getAnnIds(imgIds=[img_id])
+    anns = coco.loadAnns(ids=ann_ids)
+    obj += len(anns)
+    img_info = coco.loadImgs(ids=[img_id])[0]
+    for t in range(len(anns)):
+      if 1:
+        s = anns[t]['area']
+        if s < 32 ** 2:
+          cnt_s += 1
+        elif s < 96 ** 2:
+          cnt_m += 1
+        else:
+          cnt_l += 1
+      cnt += 1
+  print('cnt', cnt)
+  print('s, m, l ', cnt_s, cnt_m, cnt_l)
+ 
+
+# count_iou('train')
+# count_anchor('train')
+# count('train')
+count_size('train')
+
+
+
+
+
diff --git a/src/tools/convert_hourglass_weight.py b/src/tools/convert_hourglass_weight.py
new file mode 100644
index 0000000..9f001b6
--- /dev/null
+++ b/src/tools/convert_hourglass_weight.py
@@ -0,0 +1,30 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+MODEL_PATH = '../../models/ExtremeNet_500000.pkl'
+OUT_PATH = '../../models/ExtremeNet_500000.pth'
+
+import torch
+state_dict = torch.load(MODEL_PATH)
+key_map = {'t_heats': 'hm_t', 'l_heats': 'hm_l', 'b_heats': 'hm_b', \
+           'r_heats': 'hm_r', 'ct_heats': 'hm_c', \
+           't_regrs': 'reg_t', 'l_regrs': 'reg_l', \
+           'b_regrs': 'reg_b', 'r_regrs': 'reg_r'}
+
+out = {}
+for k in state_dict.keys():
+  changed = False
+  for m in key_map.keys():
+    if m in k:
+      if 'ct_heats' in k and m == 't_heats':
+        continue
+      new_k = k.replace(m, key_map[m])
+      out[new_k] = state_dict[k]
+      changed = True
+      print('replace {} to {}'.format(k, new_k))
+  if not changed:
+    out[k] = state_dict[k]
+data = {'epoch': 0,
+        'state_dict': out}
+torch.save(data, OUT_PATH)
diff --git a/src/tools/convert_kitti_to_coco.py b/src/tools/convert_kitti_to_coco.py
new file mode 100644
index 0000000..6ad4dff
--- /dev/null
+++ b/src/tools/convert_kitti_to_coco.py
@@ -0,0 +1,152 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pickle
+import json
+import numpy as np
+import cv2
+DATA_PATH = '../../data/kitti/'
+DEBUG = False
+# VAL_PATH = DATA_PATH + 'training/label_val/'
+import os
+SPLITS = ['3dop', 'subcnn'] 
+import _init_paths
+from utils.ddd_utils import compute_box_3d, project_to_image, alpha2rot_y
+from utils.ddd_utils import draw_box_3d, unproject_2d_to_3d
+
+'''
+#Values    Name      Description
+----------------------------------------------------------------------------
+   1    type         Describes the type of object: 'Car', 'Van', 'Truck',
+                     'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
+                     'Misc' or 'DontCare'
+   1    truncated    Float from 0 (non-truncated) to 1 (truncated), where
+                     truncated refers to the object leaving image boundaries
+   1    occluded     Integer (0,1,2,3) indicating occlusion state:
+                     0 = fully visible, 1 = partly occluded
+                     2 = largely occluded, 3 = unknown
+   1    alpha        Observation angle of object, ranging [-pi..pi]
+   4    bbox         2D bounding box of object in the image (0-based index):
+                     contains left, top, right, bottom pixel coordinates
+   3    dimensions   3D object dimensions: height, width, length (in meters)
+   3    location     3D object location x,y,z in camera coordinates (in meters)
+   1    rotation_y   Rotation ry around Y-axis in camera coordinates [-pi..pi]
+   1    score        Only for results: Float, indicating confidence in
+                     detection, needed for p/r curves, higher is better.
+'''
+
+def _bbox_to_coco_bbox(bbox):
+  return [(bbox[0]), (bbox[1]),
+          (bbox[2] - bbox[0]), (bbox[3] - bbox[1])]
+
+def read_clib(calib_path):
+  f = open(calib_path, 'r')
+  for i, line in enumerate(f):
+    if i == 2:
+      calib = np.array(line[:-1].split(' ')[1:], dtype=np.float32)
+      calib = calib.reshape(3, 4)
+      return calib
+
+cats = ['Pedestrian', 'Car', 'Cyclist', 'Van', 'Truck',  'Person_sitting',
+        'Tram', 'Misc', 'DontCare']
+cat_ids = {cat: i + 1 for i, cat in enumerate(cats)}
+# cat_info = [{"name": "pedestrian", "id": 1}, {"name": "vehicle", "id": 2}]
+F = 721
+H = 384 # 375
+W = 1248 # 1242
+EXT = [45.75, -0.34, 0.005]
+CALIB = np.array([[F, 0, W / 2, EXT[0]], [0, F, H / 2, EXT[1]], 
+                  [0, 0, 1, EXT[2]]], dtype=np.float32)
+
+cat_info = []
+for i, cat in enumerate(cats):
+  cat_info.append({'name': cat, 'id': i + 1})
+
+for SPLIT in SPLITS:
+  image_set_path = DATA_PATH + 'ImageSets_{}/'.format(SPLIT)
+  ann_dir = DATA_PATH + 'training/label_2/'
+  calib_dir = DATA_PATH + '{}/calib/'
+  splits = ['train', 'val']
+  # splits = ['trainval', 'test']
+  calib_type = {'train': 'training', 'val': 'training', 'trainval': 'training',
+                'test': 'testing'}
+
+  for split in splits:
+    ret = {'images': [], 'annotations': [], "categories": cat_info}
+    image_set = open(image_set_path + '{}.txt'.format(split), 'r')
+    image_to_id = {}
+    for line in image_set:
+      if line[-1] == '\n':
+        line = line[:-1]
+      image_id = int(line)
+      calib_path = calib_dir.format(calib_type[split]) + '{}.txt'.format(line)
+      calib = read_clib(calib_path)
+      image_info = {'file_name': '{}.png'.format(line),
+                    'id': int(image_id),
+                    'calib': calib.tolist()}
+      ret['images'].append(image_info)
+      if split == 'test':
+        continue
+      ann_path = ann_dir + '{}.txt'.format(line)
+      # if split == 'val':
+      #   os.system('cp {} {}/'.format(ann_path, VAL_PATH))
+      anns = open(ann_path, 'r')
+      
+      if DEBUG:
+        image = cv2.imread(
+          DATA_PATH + 'images/trainval/' + image_info['file_name'])
+
+      for ann_ind, txt in enumerate(anns):
+        tmp = txt[:-1].split(' ')
+        cat_id = cat_ids[tmp[0]]
+        truncated = int(float(tmp[1]))
+        occluded = int(tmp[2])
+        alpha = float(tmp[3])
+        bbox = [float(tmp[4]), float(tmp[5]), float(tmp[6]), float(tmp[7])]
+        dim = [float(tmp[8]), float(tmp[9]), float(tmp[10])]
+        location = [float(tmp[11]), float(tmp[12]), float(tmp[13])]
+        rotation_y = float(tmp[14])
+
+        ann = {'image_id': image_id,
+               'id': int(len(ret['annotations']) + 1),
+               'category_id': cat_id,
+               'dim': dim,
+               'bbox': _bbox_to_coco_bbox(bbox),
+               'depth': location[2],
+               'alpha': alpha,
+               'truncated': truncated,
+               'occluded': occluded,
+               'location': location,
+               'rotation_y': rotation_y}
+        ret['annotations'].append(ann)
+        if DEBUG and tmp[0] != 'DontCare':
+          box_3d = compute_box_3d(dim, location, rotation_y)
+          box_2d = project_to_image(box_3d, calib)
+          # print('box_2d', box_2d)
+          image = draw_box_3d(image, box_2d)
+          x = (bbox[0] + bbox[2]) / 2
+          '''
+          print('rot_y, alpha2rot_y, dlt', tmp[0], 
+                rotation_y, alpha2rot_y(alpha, x, calib[0, 2], calib[0, 0]),
+                np.cos(
+                  rotation_y - alpha2rot_y(alpha, x, calib[0, 2], calib[0, 0])))
+          '''
+          depth = np.array([location[2]], dtype=np.float32)
+          pt_2d = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
+                            dtype=np.float32)
+          pt_3d = unproject_2d_to_3d(pt_2d, depth, calib)
+          pt_3d[1] += dim[0] / 2
+          print('pt_3d', pt_3d)
+          print('location', location)
+      if DEBUG:
+        cv2.imshow('image', image)
+        cv2.waitKey()
+
+
+    print("# images: ", len(ret['images']))
+    print("# annotations: ", len(ret['annotations']))
+    # import pdb; pdb.set_trace()
+    out_path = '{}/annotations/kitti_{}_{}.json'.format(DATA_PATH, SPLIT, split)
+    json.dump(ret, open(out_path, 'w'))
+  
diff --git a/src/tools/eval_coco.py b/src/tools/eval_coco.py
new file mode 100644
index 0000000..4a7266b
--- /dev/null
+++ b/src/tools/eval_coco.py
@@ -0,0 +1,27 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pycocotools.coco as coco
+from pycocotools.cocoeval import COCOeval
+import sys
+import cv2
+import numpy as np
+import pickle
+import os
+
+this_dir = os.path.dirname(__file__)
+ANN_PATH = this_dir + '../../data/coco/annotations/instances_val2017.json'
+print(ANN_PATH)
+if __name__ == '__main__':
+  pred_path = sys.argv[1]
+  coco = coco.COCO(ANN_PATH)
+  dets = coco.loadRes(pred_path)
+  img_ids = coco.getImgIds()
+  num_images = len(img_ids)
+  coco_eval = COCOeval(coco, dets, "bbox")
+  coco_eval.evaluate()
+  coco_eval.accumulate()
+  coco_eval.summarize()
+
+  
diff --git a/src/tools/eval_coco_hp.py b/src/tools/eval_coco_hp.py
new file mode 100644
index 0000000..3711e04
--- /dev/null
+++ b/src/tools/eval_coco_hp.py
@@ -0,0 +1,30 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pycocotools.coco as coco
+from pycocotools.cocoeval import COCOeval
+import sys
+import cv2
+import numpy as np
+import pickle
+import os
+
+this_dir = os.path.dirname(__file__)
+ANN_PATH = this_dir + '../../data/coco/annotations/person_keypoints_val2017.json'
+print(ANN_PATH)
+if __name__ == '__main__':
+  pred_path = sys.argv[1]
+  coco = coco.COCO(ANN_PATH)
+  dets = coco.loadRes(pred_path)
+  img_ids = coco.getImgIds()
+  num_images = len(img_ids)
+  coco_eval = COCOeval(coco, dets, "keypoints")
+  coco_eval.evaluate()
+  coco_eval.accumulate()
+  coco_eval.summarize()
+  coco_eval = COCOeval(coco, dets, "bbox")
+  coco_eval.evaluate()
+  coco_eval.accumulate()
+  coco_eval.summarize()
+  
diff --git a/src/tools/get_kitti.sh b/src/tools/get_kitti.sh
new file mode 100644
index 0000000..8497173
--- /dev/null
+++ b/src/tools/get_kitti.sh
@@ -0,0 +1,9 @@
+# Quote the URLs and name the outputs: wget would save "download.php?file=...".
+mkdir kitti
+cd kitti
+wget -O data_object_image_2.zip "http://www.cvlibs.net/download.php?file=data_object_image_2.zip"
+wget -O data_object_label_2.zip "http://www.cvlibs.net/download.php?file=data_object_label_2.zip"
+wget -O data_object_calib.zip "http://www.cvlibs.net/download.php?file=data_object_calib.zip"
+unzip data_object_image_2.zip
+unzip data_object_label_2.zip
+unzip data_object_calib.zip
diff --git a/src/tools/get_pascal_voc.sh b/src/tools/get_pascal_voc.sh
new file mode 100644
index 0000000..02d9b3c
--- /dev/null
+++ b/src/tools/get_pascal_voc.sh
@@ -0,0 +1,26 @@
+mkdir voc
+cd voc
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCdevkit_18-May-2011.tar
+tar xvf VOCtrainval_06-Nov-2007.tar
+tar xvf VOCtest_06-Nov-2007.tar
+tar xvf VOCdevkit_08-Jun-2007.tar
+tar xvf VOCtrainval_11-May-2012.tar
+tar xvf VOCdevkit_18-May-2011.tar
+rm VOCtrainval_06-Nov-2007.tar
+rm VOCtest_06-Nov-2007.tar
+rm VOCdevkit_08-Jun-2007.tar
+rm VOCtrainval_11-May-2012.tar
+rm VOCdevkit_18-May-2011.tar
+mkdir images
+cp VOCdevkit/VOC2007/JPEGImages/* images/
+cp VOCdevkit/VOC2012/JPEGImages/* images/
+wget https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip
+unzip PASCAL_VOC.zip
+rm PASCAL_VOC.zip
+mv PASCAL_VOC annotations/
+cd ..
+python merge_pascal_json.py
diff --git a/src/tools/kitti_eval/README.md b/src/tools/kitti_eval/README.md
new file mode 100644
index 0000000..ee07b65
--- /dev/null
+++ b/src/tools/kitti_eval/README.md
@@ -0,0 +1,22 @@
+# kitti_eval
+
+`evaluate_object_3d_offline.cpp` evaluates your KITTI detections locally on your own computer, using validation data selected from the KITTI training dataset, with the following metrics:
+
+- overlap on image (AP)
+- oriented overlap on image (AOS)
+- overlap on ground-plane (AP)
+- overlap in 3D (AP)
+
+Compile `evaluate_object_3d_offline.cpp`; it depends on Boost and the Linux `dirent.h` header (which most Linux systems already provide).
+
+Run the evaluation by:
+
+    ./evaluate_object_3d_offline groundtruth_dir result_dir
+    
+Note that you don't have to detect over all KITTI training data. The evaluator only evaluates samples whose result files exist.
+
+
+### Updates
+
+- June, 2017:
+  * Fixed the bug of detection box filtering based on min height according to KITTI's note on 25.04.2017.
diff --git a/src/tools/kitti_eval/evaluate_object_3d.cpp b/src/tools/kitti_eval/evaluate_object_3d.cpp
new file mode 100644
index 0000000..aa990dd
--- /dev/null
+++ b/src/tools/kitti_eval/evaluate_object_3d.cpp
@@ -0,0 +1,921 @@
+// from https://github.com/prclibo/kitti_eval
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+#include <math.h>
+#include <vector>
+#include <numeric>
+#include <strings.h>
+#include <assert.h>
+
+#include <dirent.h>
+
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+#include <boost/geometry.hpp>
+#include <boost/geometry/geometries/point_xy.hpp>
+#include <boost/geometry/geometries/polygon.hpp>
+#include <boost/geometry/geometries/adapted/c_array.hpp>
+
+#include "mail.h"
+
+BOOST_GEOMETRY_REGISTER_C_ARRAY_CS(cs::cartesian)
+
+typedef boost::geometry::model::polygon<boost::geometry::model::d2::point_xy<double> > Polygon;
+
+
+using namespace std;
+
+/*=======================================================================
+STATIC EVALUATION PARAMETERS
+=======================================================================*/
+
+// holds the number of test images on the server (eval() loads this many frames)
+const int32_t N_TESTIMAGES = 7518;
+
+// easy, moderate and hard evaluation level (also the index into the per-difficulty arrays below)
+enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2};
+
+// evaluation metrics: image, ground or 3D (also the first index into MIN_OVERLAP)
+enum METRIC{IMAGE=0, GROUND=1, BOX3D=2};
+
+// evaluation parameters, one entry per DIFFICULTY
+const int32_t MIN_HEIGHT[3]     = {40, 25, 25};     // minimum height for evaluated groundtruth/detections
+const int32_t MAX_OCCLUSION[3]  = {0, 1, 2};        // maximum occlusion level of the groundtruth used for evaluation
+const double  MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the groundtruth used for evaluation
+
+// evaluated object classes (also the index into CLASS_NAMES and the second index into MIN_OVERLAP)
+enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2};
+const int NUM_CLASS = 3;
+
+// parameters varying per class
+// class names, filled by initGlobals() in the order of the CLASSES enum
+vector<string> CLASS_NAMES;
+// the minimum overlap required for 2D evaluation on the image/ground plane and 3D evaluation,
+// indexed as MIN_OVERLAP[metric][class]
+const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}};
+
+// no. of recall steps that should be evaluated (discretized); step i corresponds to recall i/(N_SAMPLE_PTS-1)
+const double N_SAMPLE_PTS = 41;
+
+
+// initialize class names; the order must match the CLASSES enum because the
+// enum values are used as indices into CLASS_NAMES.
+// The list is reset first, so calling this function more than once does not
+// append duplicate entries.
+void initGlobals () {
+  CLASS_NAMES.clear();
+  CLASS_NAMES.push_back("car");
+  CLASS_NAMES.push_back("pedestrian");
+  CLASS_NAMES.push_back("cyclist");
+}
+
+/*=======================================================================
+DATA TYPES FOR EVALUATION
+=======================================================================*/
+
+// per-image / per-threshold accumulator for the precision-recall and
+// precision-aos curves
+struct tPrData {
+  vector<double> v;           // scores of the true positives, used to derive the score thresholds
+  double         similarity;  // accumulated orientation similarity
+  int32_t        tp;          // number of true positives
+  int32_t        fp;          // number of false positives
+  int32_t        fn;          // number of false negatives
+
+  // all counters start at zero, the score list starts empty
+  tPrData ()
+    : v(),
+      similarity(0),
+      tp(0),
+      fp(0),
+      fn(0) {}
+};
+
+// 2D image box shared by ground truth and detections
+struct tBox {
+  string  type;     // object type as car, pedestrian or cyclist,...
+  double   x1;      // left border
+  double   y1;      // top border
+  double   x2;      // right border
+  double   y2;      // bottom border
+  double   alpha;   // image orientation (-10 marks an invalid orientation, see loadDetections)
+
+  // plain member-wise construction
+  tBox (string type, double x1,double y1,double x2,double y2,double alpha)
+    : type(type),
+      x1(x1),
+      y1(y1),
+      x2(x2),
+      y2(y2),
+      alpha(alpha) {}
+};
+
+// holding ground truth data
+struct tGroundtruth {
+  tBox    box;        // object type, box, orientation
+  double  truncation; // truncation 0..1
+  int32_t occlusion;  // occlusion 0,1,2 (non, partly, fully)
+  double  ry;         // rotation used by toPolygon() to orient the ground-plane footprint
+  double  t1, t2, t3; // 3D location: t1/t3 span the ground plane, t2 is the vertical coordinate
+  double  h, w, l;    // 3D dimensions (height, width, length)
+
+  // Every constructor puts the 3D members into a defined "not available"
+  // state (-10 rotation, -1000 location, -1 dimensions) instead of leaving
+  // them uninitialized; loadGroundtruth() overwrites them while parsing.
+  tGroundtruth () :
+    box(tBox("invalild",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1),
+    ry(-10),t1(-1000),t2(-1000),t3(-1000),h(-1),w(-1),l(-1) {}
+  tGroundtruth (tBox box,double truncation,int32_t occlusion) :
+    box(box),truncation(truncation),occlusion(occlusion),
+    ry(-10),t1(-1000),t2(-1000),t3(-1000),h(-1),w(-1),l(-1) {}
+  tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) :
+    box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion),
+    ry(-10),t1(-1000),t2(-1000),t3(-1000),h(-1),w(-1),l(-1) {}
+};
+
+// holding detection data
+struct tDetection {
+  tBox    box;        // object type, box, orientation
+  double  thresh;     // detection score
+  double  ry;         // rotation used by toPolygon() to orient the ground-plane footprint
+  double  t1, t2, t3; // 3D location: t1/t3 span the ground plane, t2 is the vertical coordinate
+  double  h, w, l;    // 3D dimensions (height, width, length)
+
+  // Every constructor puts the 3D members into a defined "not available"
+  // state instead of leaving them uninitialized. -1000 is the location value
+  // that loadDetections() treats as "not provided"; the members are
+  // overwritten while parsing a result file.
+  tDetection ():
+    box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000),
+    ry(-10),t1(-1000),t2(-1000),t3(-1000),h(-1),w(-1),l(-1) {}
+  tDetection (tBox box,double thresh) :
+    box(box),thresh(thresh),
+    ry(-10),t1(-1000),t2(-1000),t3(-1000),h(-1),w(-1),l(-1) {}
+  tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) :
+    box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh),
+    ry(-10),t1(-1000),t2(-1000),t3(-1000),h(-1),w(-1),l(-1) {}
+};
+
+
+/*=======================================================================
+FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS
+=======================================================================*/
+// image indices that eval() turns into "%06d.txt" file names (entry i names the i-th evaluated frame)
+// NOTE(review): nothing in this file ever fills this vector - confirm who is expected to populate it
+vector<int32_t> indices;
+
+// Reads all detections of one result file.
+// Each record consists of 16 whitespace-separated fields: the class label,
+// two ignored numbers, alpha, the 2D box (x1 y1 x2 y2), the 3D dimensions
+// (h w l), the 3D location (t1 t2 t3), ry and the detection score.
+// Records that cannot be parsed completely are dropped.
+//   compute_aos  is set to false as soon as one detection has alpha == -10
+//   eval_image / eval_ground / eval_3d  get the entry of a class set to true
+//                once a detection of that class provides the respective data
+//   success      is false if the file could not be opened, true otherwise
+vector<tDetection> loadDetections(string file_name, bool &compute_aos,
+        vector<bool> &eval_image, vector<bool> &eval_ground,
+        vector<bool> &eval_3d, bool &success) {
+
+  // holds all detections (ignored detections are indicated by an index vector)
+  vector<tDetection> detections;
+  FILE *fp = fopen(file_name.c_str(),"r");
+  if (!fp) {
+    success = false;
+    return detections;
+  }
+  while (!feof(fp)) {
+    tDetection d;
+    double trash;
+    char str[255];
+
+    // "%254s" limits the label to the buffer size (254 characters + '\0');
+    // an unbounded "%s" would overflow str on an overlong token
+    int n_read = fscanf(fp, "%254s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf",
+                   str, &trash, &trash, &d.box.alpha, &d.box.x1, &d.box.y1,
+                   &d.box.x2, &d.box.y2, &d.h, &d.w, &d.l, &d.t1, &d.t2, &d.t3,
+                   &d.ry, &d.thresh);
+
+    // end of input or read error: stop here, since feof() alone never
+    // becomes true after a read error and the loop would spin forever
+    if (n_read == EOF)
+      break;
+
+    if (n_read == 16) {
+
+      d.box.type = str;
+      detections.push_back(d);
+
+      // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid
+      if(d.box.alpha == -10)
+        compute_aos = false;
+
+      // a class is only evaluated if it is detected at least once
+      for (int c = 0; c < NUM_CLASS; c++) {
+        if (!strcasecmp(d.box.type.c_str(), CLASS_NAMES[c].c_str())) {
+          if (!eval_image[c] && d.box.x1 >= 0)
+            eval_image[c] = true;
+          if (!eval_ground[c] && d.t1 != -1000)
+            eval_ground[c] = true;
+          if (!eval_3d[c] && d.t2 != -1000)
+            eval_3d[c] = true;
+          break;
+        }
+      }
+    }
+  }
+  fclose(fp);
+  success = true;
+  return detections;
+}
+
+// Reads all ground truth objects of one label file.
+// Each record consists of 15 whitespace-separated fields: the class label,
+// truncation, occlusion, alpha, the 2D box (x1 y1 x2 y2), the 3D dimensions
+// (h w l), the 3D location (t1 t2 t3) and ry.
+// Records that cannot be parsed completely are dropped.
+//   success  is false if the file could not be opened, true otherwise
+vector<tGroundtruth> loadGroundtruth(string file_name,bool &success) {
+
+  // holds all ground truth (ignored ground truth is indicated by an index vector)
+  vector<tGroundtruth> groundtruth;
+  FILE *fp = fopen(file_name.c_str(),"r");
+  if (!fp) {
+    success = false;
+    return groundtruth;
+  }
+  while (!feof(fp)) {
+    tGroundtruth g;
+    char str[255];
+
+    // "%254s" limits the label to the buffer size (254 characters + '\0');
+    // an unbounded "%s" would overflow str on an overlong token
+    int n_read = fscanf(fp, "%254s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf",
+                   str, &g.truncation, &g.occlusion, &g.box.alpha,
+                   &g.box.x1,   &g.box.y1,     &g.box.x2,    &g.box.y2,
+                   &g.h,      &g.w,        &g.l,       &g.t1,
+                   &g.t2,      &g.t3,        &g.ry );
+
+    // end of input or read error: stop here, since feof() alone never
+    // becomes true after a read error and the loop would spin forever
+    if (n_read == EOF)
+      break;
+
+    if (n_read == 15) {
+      g.box.type = str;
+      groundtruth.push_back(g);
+    }
+  }
+  fclose(fp);
+  success = true;
+  return groundtruth;
+}
+
+// Appends one line of precision values to fp_det and, if orientation
+// similarities are available, one line of AOS values to fp_ori.
+// Nothing at all is written when precision is empty.
+void saveStats (const vector<double> &precision, const vector<double> &aos, FILE *fp_det, FILE *fp_ori) {
+
+  // without precision values there is nothing to save
+  if(precision.empty())
+    return;
+
+  // precision values, separated by blanks, on a single line
+  for (vector<double>::const_iterator p=precision.begin(); p!=precision.end(); ++p)
+    fprintf(fp_det,"%f ",*p);
+  fprintf(fp_det,"\n");
+
+  // orientation similarity is only present if there were no invalid orientation entries in the submission (alpha=-10)
+  if(!aos.empty()){
+    for (vector<double>::const_iterator s=aos.begin(); s!=aos.end(); ++s)
+      fprintf(fp_ori,"%f ",*s);
+    fprintf(fp_ori,"\n");
+  }
+}
+
+/*=======================================================================
+EVALUATION HELPER FUNCTIONS
+=======================================================================*/
+
+// overlap of two axis-aligned image boxes
+// criterion selects the normalization of the intersection area:
+//   -1: union of both boxes (intersection over union)
+//    0: area of box a
+//    1: area of box b
+// any other criterion yields the invalid overlap -1 (for intersecting boxes)
+inline double imageBoxOverlap(tBox a, tBox b, int32_t criterion=-1){
+
+  // width and height of the intersection rectangle
+  const double inter_w = min(a.x2, b.x2) - max(a.x1, b.x1);
+  const double inter_h = min(a.y2, b.y2) - max(a.y1, b.y1);
+
+  // boxes that do not intersect have 0 overlap
+  if(inter_w<=0 || inter_h<=0)
+    return 0;
+
+  // intersection area and the areas of both boxes
+  const double inter  = inter_w*inter_h;
+  const double area_a = (a.x2-a.x1) * (a.y2-a.y1);
+  const double area_b = (b.x2-b.x1) * (b.y2-b.y1);
+
+  // normalize depending on the users choice
+  switch(criterion){
+    case -1: return inter / (area_a+area_b-inter); // union
+    case  0: return inter / area_a;                // bbox_a
+    case  1: return inter / area_b;                // bbox_b
+    default: return -1;                            // unknown criterion, overlap stays invalid
+  }
+}
+
+// convenience overload: compares the image boxes of a detection (box a) and a ground truth object (box b)
+inline double imageBoxOverlap(tDetection a, tGroundtruth b, int32_t criterion=-1){
+  const tBox &det_box = a.box;
+  const tBox &gt_box  = b.box;
+  return imageBoxOverlap(det_box, gt_box, criterion);
+}
+
+// compute the ground-plane polygon of an oriented bounding box
+// T must provide the members ry (rotation), l and w (footprint extents)
+// and t1, t3 (position of the footprint centre)
+template <typename T>
+Polygon toPolygon(const T& g) {
+    using namespace boost::numeric::ublas;
+    using namespace boost::geometry;
+
+    // 2x2 rotation matrix for the angle ry
+    matrix<double> mref(2, 2);
+    mref(0, 0) = cos(g.ry); mref(0, 1) = sin(g.ry);
+    mref(1, 0) = -sin(g.ry); mref(1, 1) = cos(g.ry);
+
+    // the four corners of the unrotated footprint centred at the origin:
+    // the first four values are the coordinates along l, the last four those
+    // along w (ublas matrices are stored row-major by default)
+    matrix<double> corners(2, 4);
+    double data[] = {g.l / 2, g.l / 2, -g.l / 2, -g.l / 2,
+                     g.w / 2, -g.w / 2, -g.w / 2, g.w / 2};
+    std::copy(data, data + 8, corners.data().begin());
+
+    // rotate the corners, then move them to the position of the box
+    matrix<double> gc = prod(mref, corners);
+    for (int i = 0; i < 4; ++i) {
+        gc(0, i) += g.t1;
+        gc(1, i) += g.t3;
+    }
+
+    // the first corner is repeated at the end to close the ring
+    double points[][2] = {{gc(0, 0), gc(1, 0)},{gc(0, 1), gc(1, 1)},{gc(0, 2), gc(1, 2)},{gc(0, 3), gc(1, 3)},{gc(0, 0), gc(1, 0)}};
+    Polygon poly;
+    append(poly, points);
+    return poly;
+}
+
+// measure overlap between bird's eye view bounding boxes, parametrized by (ry, l, w, tx, tz)
+// criterion selects the normalization of the intersection area:
+//   -1: area of the union, 0: area of the detection, 1: area of the ground truth
+// any other criterion yields the invalid overlap -1 (same convention as imageBoxOverlap)
+// degenerate boxes (normalization area not positive) yield 0 instead of a division by zero
+inline double groundBoxOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) {
+    using namespace boost::geometry;
+    Polygon gp = toPolygon(g);
+    Polygon dp = toPolygon(d);
+
+    std::vector<Polygon> in;
+    intersection(gp, dp, in);
+    double inter_area = in.empty() ? 0 : area(in.front());
+
+    // area the intersection is normalized with
+    double norm_area;
+    if(criterion==-1) {   // union
+        // the union is only needed (and therefore only computed) in this branch
+        std::vector<Polygon> un;
+        union_(gp, dp, un);
+        norm_area = un.empty() ? 0 : area(un.front());
+    }
+    else if(criterion==0) // bbox_a
+        norm_area = area(dp);
+    else if(criterion==1) // bbox_b
+        norm_area = area(gp);
+    else
+        return -1;
+
+    // no valid area to normalize with => no overlap
+    if(!(norm_area > 0))
+        return 0;
+
+    return inter_area / norm_area;
+}
+
+// measure overlap between 3D bounding boxes, parametrized by (ry, h, w, l, tx, ty, tz)
+// The intersection volume is the ground-plane intersection area times the
+// overlap of the vertical extents [t2 - h, t2] of both boxes.
+// criterion selects the normalization of the intersection volume:
+//   -1: volume of the union, 0: volume of the detection, 1: volume of the ground truth
+// any other criterion yields the invalid overlap -1 (same convention as imageBoxOverlap)
+// degenerate boxes (normalization volume not positive) yield 0 instead of a division by zero
+inline double box3DOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) {
+    using namespace boost::geometry;
+    Polygon gp = toPolygon(g);
+    Polygon dp = toPolygon(d);
+
+    // only the intersection of the footprints is needed; the union volume is
+    // derived from the box volumes below
+    std::vector<Polygon> in;
+    intersection(gp, dp, in);
+
+    // overlap of the vertical extents
+    double ymax = min(d.t2, g.t2);
+    double ymin = max(d.t2 - d.h, g.t2 - g.h);
+
+    double inter_area = in.empty() ? 0 : area(in.front());
+    double inter_vol = inter_area * max(0.0, ymax - ymin);
+
+    double det_vol = d.h * d.l * d.w;
+    double gt_vol = g.h * g.l * g.w;
+
+    // volume the intersection is normalized with
+    double norm_vol;
+    if(criterion==-1)     // union
+        norm_vol = det_vol + gt_vol - inter_vol;
+    else if(criterion==0) // bbox_a
+        norm_vol = det_vol;
+    else if(criterion==1) // bbox_b
+        norm_vol = gt_vol;
+    else
+        return -1;
+
+    // no valid volume to normalize with => no overlap
+    if(!(norm_vol > 0))
+        return 0;
+
+    return inter_vol / norm_vol;
+}
+
+// Selects the detection scores at which the precision/recall curve is sampled.
+//   v              scores of all true positives; sorted in place (descending)
+//   n_groundtruth  number of valid ground truth objects (recall denominator)
+// Returns the scores whose recall is the best approximation of the linearly
+// spaced recall positions 0, 1/(N_SAMPLE_PTS-1), 2/(N_SAMPLE_PTS-1), ...
+vector<double> getThresholds(vector<double> &v, double n_groundtruth){
+
+  // holds scores needed to compute N_SAMPLE_PTS recall values
+  vector<double> t;
+
+  // sort scores in descending order
+  // (highest score is assumed to give best/most confident detections)
+  sort(v.begin(), v.end(), greater<double>());
+
+  // get scores for linearly spaced recall
+  double current_recall = 0;
+  const size_t n_scores = v.size();
+  for(size_t i=0; i<n_scores; i++){
+
+    // recall reached with the current score (left) and with the next score (right);
+    // for the last score there is no next one
+    const bool   is_last  = (i+1 == n_scores);
+    const double l_recall = (double)(i+1)/n_groundtruth;
+    const double r_recall = is_last ? l_recall : (double)(i+2)/n_groundtruth;
+
+    // if the right-hand-side recall is closer to the current recall position
+    // than the left-hand-side one, skip the current score
+    // (the last score is never skipped)
+    if(!is_last && (r_recall-current_recall) < (current_recall-l_recall))
+      continue;
+
+    // left recall is the best approximation: use this score and go to the next recall position
+    t.push_back(v[i]);
+    current_recall += 1.0/(N_SAMPLE_PTS-1.0);
+  }
+  return t;
+}
+
+// Classifies the ground truth and the detections of one frame for the given class and difficulty.
+//   ignored_gt   gets one entry per ground truth object:
+//                  0 = current class, counted for recall (n_gt is incremented)
+//                  1 = neighboring class, or current class that is too occluded/truncated/small
+//                 -1 = any other class
+//   dc           gets a copy of every "DontCare" ground truth entry
+//   ignored_det  gets one entry per detection:
+//                  1 = box lower than MIN_HEIGHT (regardless of its class)
+//                  0 = current class
+//                 -1 = any other class
+void cleanData(CLASSES current_class, const vector<tGroundtruth> &gt, const vector<tDetection> &det, vector<int32_t> &ignored_gt, vector<tGroundtruth> &dc, vector<int32_t> &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){
+
+  // name of the evaluated class (all comparisons ignore lower/upper case)
+  const char *cls_name = CLASS_NAMES[current_class].c_str();
+
+  // walk over the ground truth once: label every object and collect the dontcare areas
+  for(size_t i=0; i<gt.size(); i++){
+    const tGroundtruth &obj = gt[i];
+    const char *obj_type = obj.box.type.c_str();
+
+    if(!strcasecmp(obj_type, cls_name)){
+
+      // object of the current class: it is ignored (neither FN nor TP, although
+      // detections may be assigned) if occlusion or truncation exceed the
+      // difficulty or if the box is too small
+      const double height = obj.box.y2 - obj.box.y1;
+      const bool too_hard = obj.occlusion>MAX_OCCLUSION[difficulty]
+                         || obj.truncation>MAX_TRUNCATION[difficulty]
+                         || height<MIN_HEIGHT[difficulty];
+      if(too_hard)
+        ignored_gt.push_back(1);
+      else{
+        // counts for the recall denominator
+        ignored_gt.push_back(0);
+        n_gt++;
+      }
+    }
+
+    // neighboring classes are ignored ("van" for "car" and "person_sitting" for "pedestrian")
+    else if(   (!strcasecmp(cls_name, "Pedestrian") && !strcasecmp("Person_sitting", obj_type))
+            || (!strcasecmp(cls_name, "Car") && !strcasecmp("Van", obj_type)))
+      ignored_gt.push_back(1);
+
+    // all other classes are not used for the evaluation of this class
+    else
+      ignored_gt.push_back(-1);
+
+    // dontcare areas are kept separately
+    if(!strcasecmp("DontCare", obj_type))
+      dc.push_back(obj);
+  }
+
+  // label the detections
+  for(size_t j=0; j<det.size(); j++){
+
+    // box height, truncated to an integer
+    const int32_t height = fabs(det[j].box.y1 - det[j].box.y2);
+
+    int32_t label;
+    if(height<MIN_HEIGHT[difficulty])
+      label = 1;   // too small, takes precedence over the class check
+    else if(!strcasecmp(det[j].box.type.c_str(), cls_name))
+      label = 0;   // current class
+    else
+      label = -1;  // other classes are not evaluated
+    ignored_det.push_back(label);
+  }
+}
+
+// Matches the detections of one frame against its ground truth and counts TP, FP and FN.
+//   gt, det                  ground truth objects and detections of the frame
+//   dc                       dontcare areas of the frame
+//   ignored_gt, ignored_det  labels as produced by cleanData() (0 = evaluated, 1 = ignored, -1 = other class)
+//   compute_fp               false: only TPs/FNs are determined and the TP scores are collected in stat.v,
+//                                   each ground truth takes the candidate with the highest score
+//                            true:  detections with a score below thresh are dropped, each ground truth
+//                                   takes the candidate with the greatest overlap, FPs are counted
+//   boxoverlap               overlap function; called with criterion -1 for ground truth and 0 for dontcare areas
+//   metric                   selects the row of MIN_OVERLAP
+//   compute_aos              additionally accumulate the orientation similarity (only evaluated if compute_fp is set)
+//   thresh                   score threshold, only used if compute_fp is set
+//   debug                    not used inside this function
+tPrData computeStatistics(CLASSES current_class, const vector<tGroundtruth> &gt,
+        const vector<tDetection> &det, const vector<tGroundtruth> &dc,
+        const vector<int32_t> &ignored_gt, const vector<int32_t>  &ignored_det,
+        bool compute_fp, double (*boxoverlap)(tDetection, tGroundtruth, int32_t),
+        METRIC metric, bool compute_aos=false, double thresh=0, bool debug=false){
+
+  tPrData stat = tPrData();
+  const double NO_DETECTION = -10000000;
+  vector<double> delta;            // holds angular difference for TPs (needed for AOS evaluation)
+  vector<bool> assigned_detection; // holds whether a detection was assigned to a valid or ignored ground truth
+  assigned_detection.assign(det.size(), false);
+  vector<bool> ignored_threshold;
+  ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed
+
+  // detections with a low score are ignored for computing precision (needs FP)
+  if(compute_fp)
+    for(int32_t i=0; i<det.size(); i++)
+      if(det[i].thresh<thresh)
+        ignored_threshold[i] = true;
+
+  // evaluate all ground truth boxes
+  for(int32_t i=0; i<gt.size(); i++){
+
+    // this ground truth is not of the current or a neighboring class and therefore ignored
+    if(ignored_gt[i]==-1)
+      continue;
+
+    /*=======================================================================
+    find candidates (overlap with ground truth > MIN_OVERLAP[metric][class])
+    =======================================================================*/
+    int32_t det_idx          = -1;
+    double valid_detection = NO_DETECTION;
+    double max_overlap     = 0;
+
+    // search for a possible detection
+    bool assigned_ignored_det = false;
+    for(int32_t j=0; j<det.size(); j++){
+
+      // detections not of the current class, already assigned or with a low threshold are ignored
+      if(ignored_det[j]==-1)
+        continue;
+      if(assigned_detection[j])
+        continue;
+      if(ignored_threshold[j])
+        continue;
+
+      // overlap of this detection with the current ground truth (normalized by the union)
+      double overlap = boxoverlap(det[j], gt[i], -1);
+
+      // for computing recall thresholds, the candidate with highest score is considered
+      if(!compute_fp && overlap>MIN_OVERLAP[metric][current_class] && det[j].thresh>valid_detection){
+        det_idx         = j;
+        valid_detection = det[j].thresh;
+      }
+
+      // for computing pr curve values, the candidate with the greatest overlap is considered
+      // if the current candidate is an ignored detection (min_height), any evaluated overlapping detection replaces it
+      else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){
+        max_overlap     = overlap;
+        det_idx         = j;
+        valid_detection = 1;
+        assigned_ignored_det = false;
+      }
+      // an ignored detection (min_height) is only taken as long as no other candidate was found
+      else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){
+        det_idx              = j;
+        valid_detection      = 1;
+        assigned_ignored_det = true;
+      }
+    }
+
+    /*=======================================================================
+    compute TP, FP and FN
+    =======================================================================*/
+
+    // nothing was assigned to this valid ground truth
+    if(valid_detection==NO_DETECTION && ignored_gt[i]==0) {
+      stat.fn++;
+    }
+
+    // only evaluate valid ground truth <=> detection assignments (considering difficulty level):
+    // if the ground truth or the detection is ignored, the detection is consumed without counting as TP
+    else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1))
+      assigned_detection[det_idx] = true;
+
+    // found a valid true positive
+    else if(valid_detection!=NO_DETECTION){
+
+      // write the score of the assigned detection to the threshold vector
+      stat.tp++;
+      stat.v.push_back(det[det_idx].thresh);
+
+      // compute angular difference of detection and ground truth if valid detection orientation was provided
+      if(compute_aos)
+        delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha);
+
+      // clean up
+      assigned_detection[det_idx] = true;
+    }
+  }
+
+  // if FP are requested, consider stuff area
+  if(compute_fp){
+
+    // count fp
+    for(int32_t i=0; i<det.size(); i++){
+
+      // every evaluated detection that was not assigned is a false positive
+      // (other classes, too small boxes (ignored_det==1) and low scores are not counted)
+      if(!(assigned_detection[i] || ignored_det[i]==-1 || ignored_det[i]==1 || ignored_threshold[i]))
+        stat.fp++;
+    }
+
+    // do not consider detections overlapping with stuff area
+    int32_t nstuff = 0;
+    for(int32_t i=0; i<dc.size(); i++){
+      for(int32_t j=0; j<det.size(); j++){
+
+        // detections not of the current class, already assigned, with a low threshold or a low minimum height are ignored
+        if(assigned_detection[j])
+          continue;
+        if(ignored_det[j]==-1 || ignored_det[j]==1)
+          continue;
+        if(ignored_threshold[j])
+          continue;
+
+        // compute overlap (criterion 0) and assign to stuff area, if overlap exceeds class specific value;
+        // marking the detection as assigned makes sure it is counted only once
+        double overlap = boxoverlap(det[j], dc[i], 0);
+        if(overlap>MIN_OVERLAP[metric][current_class]){
+          assigned_detection[j] = true;
+          nstuff++;
+        }
+      }
+    }
+
+    // FP = no. of all not to ground truth assigned detections - detections assigned to stuff areas
+    stat.fp -= nstuff;
+
+    // if all orientation values are valid, the AOS is computed
+    if(compute_aos){
+      vector<double> tmp;
+
+      // FP have a similarity of 0, for all TP compute AOS
+      tmp.assign(stat.fp, 0);
+      for(int32_t i=0; i<delta.size(); i++)
+        tmp.push_back((1.0+cos(delta[i]))/2.0);
+
+      // be sure, that all orientation deltas are computed
+      assert(tmp.size()==stat.fp+stat.tp);
+      assert(delta.size()==stat.tp);
+
+      // get the summed orientation similarity for this image
+      // (the sum is not divided here; eval_class divides the total by tp+fp)
+      if(stat.tp>0 || stat.fp>0)
+        stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0);
+
+      // there was neither a FP nor a TP, so the similarity is ignored in the evaluation
+      else
+        stat.similarity = -1;
+    }
+  }
+  return stat;
+}
+
+/*=======================================================================
+EVALUATE CLASS-WISE
+=======================================================================*/
+
+// Evaluates one class at one difficulty level for one metric over all frames.
+//   fp_det, fp_ori  open stats files; one line of precision (and, if
+//                   compute_aos is set, of AOS) values is appended to them
+//   groundtruth, detections  per-frame data, both vectors must have the same size
+//   boxoverlap      overlap function belonging to metric
+//   precision, aos  receive N_SAMPLE_PTS values each (aos only if compute_aos is set)
+// Returns true (there is no failure path in this function).
+bool eval_class (FILE *fp_det, FILE *fp_ori, CLASSES current_class,
+        const vector< vector<tGroundtruth> > &groundtruth,
+        const vector< vector<tDetection> > &detections, bool compute_aos,
+        double (*boxoverlap)(tDetection, tGroundtruth, int32_t),
+        vector<double> &precision, vector<double> &aos,
+        DIFFICULTY difficulty, METRIC metric) {
+  assert(groundtruth.size() == detections.size());
+
+  // init
+  int32_t n_gt=0;                                     // total no. of gt (denominator of recall)
+  vector<double> v, thresholds;                       // detection scores, evaluated for recall discretization
+  vector< vector<int32_t> > ignored_gt, ignored_det;  // index of ignored gt detection for current class/difficulty
+  vector< vector<tGroundtruth> > dontcare;            // index of dontcare areas, included in ground truth
+
+  // first pass over all frames: label the data and collect the scores of all true positives
+  for (size_t i=0; i<groundtruth.size(); i++){
+
+    // holds ignored ground truth, ignored detections and dontcare areas for current frame
+    vector<int32_t> i_gt, i_det;
+    vector<tGroundtruth> dc;
+
+    // only evaluate objects of current class and ignore occluded, truncated objects
+    cleanData(current_class, groundtruth[i], detections[i], i_gt, dc, i_det, n_gt, difficulty);
+    ignored_gt.push_back(i_gt);
+    ignored_det.push_back(i_det);
+    dontcare.push_back(dc);
+
+    // compute statistics to get recall values
+    const tPrData pr_tmp = computeStatistics(current_class, groundtruth[i], detections[i], dc, i_gt, i_det, false, boxoverlap, metric);
+
+    // add detection scores to vector over all images
+    v.insert(v.end(), pr_tmp.v.begin(), pr_tmp.v.end());
+  }
+
+  // get scores that must be evaluated for recall discretization
+  thresholds = getThresholds(v, n_gt);
+
+  // second pass: compute TP,FP,FN for relevant scores
+  vector<tPrData> pr;
+  pr.assign(thresholds.size(),tPrData());
+  for (size_t i=0; i<groundtruth.size(); i++){
+
+    // for all scores/recall thresholds do:
+    for(size_t t=0; t<thresholds.size(); t++){
+      const tPrData tmp = computeStatistics(current_class, groundtruth[i], detections[i], dontcare[i],
+                                            ignored_gt[i], ignored_det[i], true, boxoverlap, metric,
+                                            compute_aos, thresholds[t], t==38);
+
+      // add no. of TP, FP, FN, AOS for current frame to total evaluation for current threshold
+      pr[t].tp += tmp.tp;
+      pr[t].fp += tmp.fp;
+      pr[t].fn += tmp.fn;
+      if(tmp.similarity!=-1)
+        pr[t].similarity += tmp.similarity;
+    }
+  }
+
+  // compute precision and AOS; sample points without a threshold stay 0
+  precision.assign(N_SAMPLE_PTS, 0);
+  if(compute_aos)
+    aos.assign(N_SAMPLE_PTS, 0);
+  for (size_t i=0; i<thresholds.size(); i++){
+    precision[i] = pr[i].tp/(double)(pr[i].tp + pr[i].fp);
+    if(compute_aos)
+      aos[i] = pr[i].similarity/(double)(pr[i].tp + pr[i].fp);
+  }
+
+  // filter precision and AOS using max_{i..end}(precision)
+  for (size_t i=0; i<thresholds.size(); i++){
+    precision[i] = *max_element(precision.begin()+i, precision.end());
+    if(compute_aos)
+      aos[i] = *max_element(aos.begin()+i, aos.end());
+  }
+
+  // save statistics and finish with success
+  saveStats(precision, aos, fp_det, fp_ori);
+  return true;
+}
+
+// Writes the curves of the three difficulty levels to <dir_name>/<file_name>.txt
+// and renders them with gnuplot to png and eps; the eps is converted to a cropped pdf.
+//   vals    array of three vectors (easy, moderate, hard) with N_SAMPLE_PTS values each
+//   is_aos  selects the y-axis label (orientation similarity instead of precision)
+// If one of the output files cannot be opened, a message is printed to stderr
+// and the function returns without plotting.
+// NOTE(review): dir_name and file_name are inserted unescaped into shell
+// commands - they must not contain blanks or shell metacharacters.
+void saveAndPlotPlots(string dir_name,string file_name,string obj_type,vector<double> vals[],bool is_aos){
+
+  const string base_path = dir_name + "/" + file_name;
+
+  // save plot data to file
+  FILE *fp_data = fopen((base_path + ".txt").c_str(),"w");
+  if (!fp_data) {
+    fprintf(stderr,"ERROR: Couldn't write: %s\n", (base_path + ".txt").c_str());
+    return;
+  }
+  printf("save %s\n", (base_path + ".txt").c_str());
+  for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++)
+    fprintf(fp_data,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]);
+  fclose(fp_data);
+
+  // the plot title is the object type with a capital first letter
+  if (!obj_type.empty())
+    obj_type[0] = toupper(obj_type[0]);
+
+  // create png + eps
+  for (int32_t j=0; j<2; j++) {
+
+    // open file
+    FILE *fp = fopen((base_path + ".gp").c_str(),"w");
+    if (!fp) {
+      fprintf(stderr,"ERROR: Couldn't write: %s\n", (base_path + ".gp").c_str());
+      return;
+    }
+
+    // save gnuplot instructions
+    if (j==0) {
+      fprintf(fp,"set term png size 450,315 font \"Helvetica\" 11\n");
+      fprintf(fp,"set output \"%s.png\"\n",file_name.c_str());
+    } else {
+      fprintf(fp,"set term postscript eps enhanced color font \"Helvetica\" 20\n");
+      fprintf(fp,"set output \"%s.eps\"\n",file_name.c_str());
+    }
+
+    // set labels and ranges
+    fprintf(fp,"set size ratio 0.7\n");
+    fprintf(fp,"set xrange [0:1]\n");
+    fprintf(fp,"set yrange [0:1]\n");
+    fprintf(fp,"set xlabel \"Recall\"\n");
+    if (!is_aos) fprintf(fp,"set ylabel \"Precision\"\n");
+    else         fprintf(fp,"set ylabel \"Orientation Similarity\"\n");
+    fprintf(fp,"set title \"%s\"\n",obj_type.c_str());
+
+    // line width
+    int32_t   lw = 5;
+    if (j==0) lw = 3;
+
+    // plot error curve
+    fprintf(fp,"plot ");
+    fprintf(fp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw);
+    fprintf(fp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw);
+    fprintf(fp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw);
+
+    // close file
+    fclose(fp);
+
+    // run gnuplot => create png + eps
+    // (commands are assembled as strings, so long paths cannot overflow a fixed buffer)
+    system(("cd " + dir_name + "; gnuplot " + file_name + ".gp").c_str());
+  }
+
+  // create pdf and crop
+  system(("cd " + dir_name + "; ps2pdf " + file_name + ".eps " + file_name + "_large.pdf").c_str());
+  system(("cd " + dir_name + "; pdfcrop " + file_name + "_large.pdf " + file_name + ".pdf").c_str());
+  system(("cd " + dir_name + "; rm " + file_name + "_large.pdf").c_str());
+}
+
+// Evaluates class c for one metric on all three difficulty levels.
+// Writes <result_dir>/stats_<class><suffix>.txt (plus the orientation stats if
+// compute_aos is set) and the corresponding plots below plot_dir.
+// Returns false if a stats file cannot be opened or an evaluation fails; all
+// files opened here are closed again on every path.
+static bool evalClassForMetric(const string &result_dir, const string &plot_dir,
+        int c, const string &suffix,
+        const vector< vector<tGroundtruth> > &groundtruth,
+        const vector< vector<tDetection> > &detections, bool compute_aos,
+        double (*boxoverlap)(tDetection, tGroundtruth, int32_t),
+        METRIC metric, Mail *mail) {
+
+  const CLASSES cls = (CLASSES)c;
+
+  // open the stats files
+  const string det_path = result_dir + "/stats_" + CLASS_NAMES[c] + suffix + ".txt";
+  FILE *fp_det = fopen(det_path.c_str(), "w");
+  if (!fp_det) {
+    mail->msg("ERROR: Couldn't write: %s", det_path.c_str());
+    return false;
+  }
+  FILE *fp_ori = 0;
+  if (compute_aos) {
+    const string ori_path = result_dir + "/stats_" + CLASS_NAMES[c] + "_orientation.txt";
+    fp_ori = fopen(ori_path.c_str(),"w");
+    if (!fp_ori) {
+      fclose(fp_det);
+      mail->msg("ERROR: Couldn't write: %s", ori_path.c_str());
+      return false;
+    }
+  }
+
+  // evaluate easy, moderate and hard; stop at the first failure
+  const DIFFICULTY levels[3] = {EASY, MODERATE, HARD};
+  vector<double> precision[3], aos[3];
+  bool ok = true;
+  for (int d = 0; d < 3 && ok; d++)
+    ok = eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos,
+                    boxoverlap, precision[d], aos[d], levels[d], metric);
+
+  fclose(fp_det);
+  if (fp_ori)
+    fclose(fp_ori);
+
+  if (!ok) {
+    mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str());
+    return false;
+  }
+
+  // plots
+  saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + suffix, CLASS_NAMES[c], precision, 0);
+  if (compute_aos)
+    saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_orientation", CLASS_NAMES[c], aos, 1);
+  return true;
+}
+
+// Runs the complete evaluation of the submission stored in results/<result_sha>:
+// loads ground truth and detections of all frames, then evaluates every
+// detected class on the image plane, in bird's eye view and in 3D.
+// The frames are named by the global `indices` if it has entries; if it is
+// empty, the frames 0 .. N_TESTIMAGES-1 are used (previously an empty
+// `indices` made indices.at() throw for the very first frame).
+// Returns false if a file cannot be read/written or an evaluation fails.
+bool eval(string result_sha,Mail* mail){
+
+  // set some global parameters
+  initGlobals();
+
+  // ground truth and result directories
+  string gt_dir         = "data/object/label_2";
+  string result_dir     = "results/" + result_sha;
+  string plot_dir       = result_dir + "/plot";
+
+  // create output directories
+  system(("mkdir " + plot_dir).c_str());
+
+  // hold detections and ground truth in memory
+  vector< vector<tGroundtruth> > groundtruth;
+  vector< vector<tDetection> >   detections;
+
+  // holds whether orientation similarity shall be computed (might be set to false while loading detections)
+  // and which labels were provided by this submission
+  bool compute_aos=true;
+  vector<bool> eval_image(NUM_CLASS, false);
+  vector<bool> eval_ground(NUM_CLASS, false);
+  vector<bool> eval_3d(NUM_CLASS, false);
+
+  // frames to evaluate: the listed indices, or all test images if no index list is given
+  const bool    use_indices = !indices.empty();
+  const int32_t n_images    = use_indices ? (int32_t)indices.size() : N_TESTIMAGES;
+
+  // for all images read groundtruth and detections
+  mail->msg("Loading detections...");
+  for (int32_t i=0; i<n_images; i++) {
+
+    // file name
+    char file_name[256];
+    sprintf(file_name,"%06d.txt",use_indices ? indices[i] : i);
+
+    // read ground truth and result poses
+    bool gt_success,det_success;
+    vector<tGroundtruth> gt   = loadGroundtruth(gt_dir + "/" + file_name,gt_success);
+    vector<tDetection>   det  = loadDetections(result_dir + "/data/" + file_name,
+            compute_aos, eval_image, eval_ground, eval_3d, det_success);
+    groundtruth.push_back(gt);
+    detections.push_back(det);
+
+    // check for errors
+    if (!gt_success) {
+      mail->msg("ERROR: Couldn't read: %s of ground truth. Please write me an email!", file_name);
+      return false;
+    }
+    if (!det_success) {
+      mail->msg("ERROR: Couldn't read: %s", file_name);
+      return false;
+    }
+  }
+  mail->msg("  done.");
+
+  // eval image 2D bounding boxes
+  for (int c = 0; c < NUM_CLASS; c++)
+    if (eval_image[c] && !evalClassForMetric(result_dir, plot_dir, c, "_detection",
+            groundtruth, detections, compute_aos, imageBoxOverlap, IMAGE, mail))
+      return false;
+
+  // don't evaluate AOS for birdview boxes and 3D boxes
+  compute_aos = false;
+
+  // eval bird's eye view bounding boxes
+  for (int c = 0; c < NUM_CLASS; c++)
+    if (eval_ground[c] && !evalClassForMetric(result_dir, plot_dir, c, "_detection_ground",
+            groundtruth, detections, compute_aos, groundBoxOverlap, GROUND, mail))
+      return false;
+
+  // eval 3D bounding boxes
+  for (int c = 0; c < NUM_CLASS; c++)
+    if (eval_3d[c] && !evalClassForMetric(result_dir, plot_dir, c, "_detection_3d",
+            groundtruth, detections, compute_aos, box3DOverlap, BOX3D, mail))
+      return false;
+
+  // success
+  return true;
+}
+
+// Entry point: evaluates the submission results/<result_sha>.
+//   argv[1]  result_sha
+//   argv[2]  user_sha (only in the 4-argument form)
+//   argv[3]  email    (only in the 4-argument form)
+// Returns 1 on wrong usage and 0 otherwise (also if the evaluation itself failed).
+int32_t main (int32_t argc,char *argv[]) {
+
+  // we need 2 or 4 arguments!
+  if (argc!=2 && argc!=4) {
+    cout << "Usage: ./eval_detection result_sha [user_sha email]" << endl;
+    return 1;
+  }
+
+  // read arguments
+  string result_sha = argv[1];
+
+  // user_sha only exists in the 4-argument form; with 2 arguments argv[2] is
+  // a null pointer and must not be handed to a "%s" format
+  const char *user_sha = (argc==4) ? argv[2] : "";
+
+  // init notification mail
+  Mail *mail;
+  if (argc==4) mail = new Mail(argv[3]);
+  else         mail = new Mail();
+  mail->msg("Thank you for participating in our evaluation!");
+
+  // run evaluation
+  if (eval(result_sha,mail)) {
+    mail->msg("Your evaluation results are available at:");
+    mail->msg("http://www.cvlibs.net/datasets/kitti/user_submit_check_login.php?benchmark=object&user=%s&result=%s",user_sha, result_sha.c_str());
+  } else {
+    // NOTE(review): result_sha is passed unescaped to the shell - it must come from a trusted source
+    system(("rm -r results/" + result_sha).c_str());
+    mail->msg("An error occured while processing your results.");
+    mail->msg("Please make sure that the data in your zip archive has the right format!");
+  }
+
+  // send mail and exit
+  delete mail;
+
+  return 0;
+}
+
+
diff --git a/src/tools/kitti_eval/evaluate_object_3d_offline b/src/tools/kitti_eval/evaluate_object_3d_offline
new file mode 100755
index 0000000..c7b42d6
Binary files /dev/null and b/src/tools/kitti_eval/evaluate_object_3d_offline differ
diff --git a/src/tools/kitti_eval/evaluate_object_3d_offline.cpp b/src/tools/kitti_eval/evaluate_object_3d_offline.cpp
new file mode 100644
index 0000000..72e4f80
--- /dev/null
+++ b/src/tools/kitti_eval/evaluate_object_3d_offline.cpp
@@ -0,0 +1,948 @@
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+#include <math.h>
+#include <vector>
+#include <numeric>
+#include <strings.h>
+#include <assert.h>
+
+#include <dirent.h>
+
+#include <boost/numeric/ublas/matrix.hpp>
+#include <boost/numeric/ublas/io.hpp>
+
+#include <boost/geometry.hpp>
+#include <boost/geometry/geometries/point_xy.hpp>
+#include <boost/geometry/geometries/polygon.hpp>
+#include <boost/geometry/geometries/adapted/c_array.hpp>
+
+#include "mail.h"
+
+BOOST_GEOMETRY_REGISTER_C_ARRAY_CS(cs::cartesian)
+
+typedef boost::geometry::model::polygon<boost::geometry::model::d2::point_xy<double> > Polygon;
+
+
+using namespace std;
+
+/*=======================================================================
+STATIC EVALUATION PARAMETERS
+=======================================================================*/
+
+// holds the number of test images on the server
+// (not referenced by the code visible in this file)
+const int32_t N_TESTIMAGES = 7518;
+
+// easy, moderate and hard evaluation level
+// (the values are used as indices into MIN_HEIGHT, MAX_OCCLUSION and MAX_TRUNCATION)
+enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2};
+
+// evaluation metrics: image, ground or 3D
+// (the values are used as first index into MIN_OVERLAP)
+enum METRIC{IMAGE=0, GROUND=1, BOX3D=2};
+
+// evaluation parameter, one entry per difficulty level (EASY, MODERATE, HARD)
+const int32_t MIN_HEIGHT[3]     = {40, 25, 25};     // minimum height for evaluated groundtruth/detections
+const int32_t MAX_OCCLUSION[3]  = {0, 1, 2};        // maximum occlusion level of the groundtruth used for evaluation
+const double  MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the groundtruth used for evaluation
+
+// evaluated object classes
+// (the values are used as indices into CLASS_NAMES and as second index into MIN_OVERLAP)
+enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2};
+const int NUM_CLASS = 3;
+
+// parameters varying per class
+// class names, filled by initGlobals() in the order of the CLASSES enum
+vector<string> CLASS_NAMES;
+// the minimum overlap required for 2D evaluation on the image/ground plane and 3D evaluation
+// indexed as MIN_OVERLAP[metric][class]
+const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}};
+// alternative table (disabled):
+// const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}};
+
+// no. of recall steps that should be evaluated (discretized)
+// declared as double because it takes part in floating point arithmetic
+// (recall step = 1/(N_SAMPLE_PTS-1)); it is converted implicitly where a count is needed
+const double N_SAMPLE_PTS = 41;
+
+
+// initialize class names
+// Entry i of CLASS_NAMES is the name of the class with CLASSES value i.
+// The list is rebuilt from scratch, so calling this function more than once
+// does not append duplicate names.
+void initGlobals () {
+  CLASS_NAMES.clear();
+  CLASS_NAMES.push_back("car");
+  CLASS_NAMES.push_back("pedestrian");
+  CLASS_NAMES.push_back("cyclist");
+}
+
+/*=======================================================================
+DATA TYPES FOR EVALUATION
+=======================================================================*/
+
+// statistics needed for the precision-recall and precision-aos curves
+struct tPrData {
+  vector<double> v;           // scores of matched detections, used for computing score thresholds
+  double         similarity;  // orientation similarity
+  int32_t        tp;          // number of true positives
+  int32_t        fp;          // number of false positives
+  int32_t        fn;          // number of false negatives
+
+  // starts with an empty score list and all counters set to zero
+  tPrData ()
+    : v(),
+      similarity(0),
+      tp(0),
+      fp(0),
+      fn(0) {}
+};
+
+// 2D image bounding box shared by ground truth and detections
+struct tBox {
+  string  type;     // object type as car, pedestrian or cyclist,...
+  double   x1;      // left corner
+  double   y1;      // top corner
+  double   x2;      // right corner
+  double   y2;      // bottom corner
+  double   alpha;   // image orientation
+
+  // builds a box from its type, its four borders and its orientation
+  tBox (string box_type, double left, double top, double right, double bottom, double orientation)
+    : type(box_type),
+      x1(left),
+      y1(top),
+      x2(right),
+      y2(bottom),
+      alpha(orientation) {}
+};
+
+// holding ground truth data
+// The 3D members (ry, t1..t3, h, w, l) are not constructor arguments; they are
+// zero-initialized here and filled in afterwards by the loader.
+struct tGroundtruth {
+  tBox    box;        // object type, box, orientation
+  double  truncation; // truncation 0..1
+  int32_t occlusion;  // occlusion 0,1,2 (non, partly, fully)
+  double  ry;         // rotation used to orient the bird's eye view box (see toPolygon)
+  double  t1, t2, t3; // 3D location; t1/t3 span the ground plane, t2 is the vertical coordinate
+  double  h, w, l;    // 3D box height, width, length
+
+  // invalid placeholder entry
+  tGroundtruth () :
+    box(tBox("invalid",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1),
+    ry(0),t1(0),t2(0),t3(0),h(0),w(0),l(0) {}
+
+  // entry built from an existing 2D box
+  tGroundtruth (tBox box,double truncation,int32_t occlusion) :
+    box(box),truncation(truncation),occlusion(occlusion),
+    ry(0),t1(0),t2(0),t3(0),h(0),w(0),l(0) {}
+
+  // entry built from the individual 2D box values
+  tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) :
+    box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion),
+    ry(0),t1(0),t2(0),t3(0),h(0),w(0),l(0) {}
+};
+
+// holding detection data
+// The 3D members (ry, t1..t3, h, w, l) are not constructor arguments; they are
+// zero-initialized here and filled in afterwards by the loader.
+struct tDetection {
+  tBox    box;        // object type, box, orientation
+  double  thresh;     // detection score
+  double  ry;         // rotation used to orient the bird's eye view box (see toPolygon)
+  double  t1, t2, t3; // 3D location; t1/t3 span the ground plane, t2 is the vertical coordinate
+  double  h, w, l;    // 3D box height, width, length
+
+  // invalid placeholder entry
+  tDetection ():
+    box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000),
+    ry(0),t1(0),t2(0),t3(0),h(0),w(0),l(0) {}
+
+  // entry built from an existing 2D box
+  tDetection (tBox box,double thresh) :
+    box(box),thresh(thresh),
+    ry(0),t1(0),t2(0),t3(0),h(0),w(0),l(0) {}
+
+  // entry built from the individual 2D box values
+  tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) :
+    box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh),
+    ry(0),t1(0),t2(0),t3(0),h(0),w(0),l(0) {}
+};
+
+
+/*=======================================================================
+FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS
+=======================================================================*/
+vector<int32_t> indices;
+
+// Reads all detections of one result file.
+//   file_name    path of the result file (one detection per record, 16 fields)
+//   compute_aos  set to false as soon as one detection has orientation -10
+//   eval_image   per class: set to true if a detection of that class has x1 >= 0
+//   eval_ground  per class: set to true if a detection of that class has t1 != -1000
+//   eval_3d      per class: set to true if a detection of that class has t2 != -1000
+//   success      false if the file could not be opened, true otherwise
+// Returns the detections in file order; records that do not parse into all
+// 16 fields are dropped.
+vector<tDetection> loadDetections(string file_name, bool &compute_aos,
+        vector<bool> &eval_image, vector<bool> &eval_ground,
+        vector<bool> &eval_3d, bool &success) {
+
+  // holds all detections (ignored detections are indicated by an index vector)
+  vector<tDetection> detections;
+  FILE *fp = fopen(file_name.c_str(),"r");
+  if (!fp) {
+    success = false;
+    return detections;
+  }
+
+  for (;;) {
+    tDetection d;
+    double trash;   // fields 2 and 3 of a record are read but not used
+    char str[255];
+
+    // the field width keeps an overlong type name from overflowing str
+    int n_read = fscanf(fp, "%254s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf",
+                        str, &trash, &trash, &d.box.alpha, &d.box.x1, &d.box.y1,
+                        &d.box.x2, &d.box.y2, &d.h, &d.w, &d.l, &d.t1, &d.t2, &d.t3,
+                        &d.ry, &d.thresh);
+
+    // EOF is also returned on a read error, so the loop always terminates
+    if (n_read == EOF)
+      break;
+
+    // incomplete record => skip it
+    if (n_read != 16)
+      continue;
+
+    d.box.type = str;
+    detections.push_back(d);
+
+    // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid
+    if(d.box.alpha == -10)
+      compute_aos = false;
+
+    // a class is only evaluated if it is detected at least once
+    for (int c = 0; c < NUM_CLASS; c++) {
+      if (!strcasecmp(d.box.type.c_str(), CLASS_NAMES[c].c_str())) {
+        if (!eval_image[c] && d.box.x1 >= 0)
+          eval_image[c] = true;
+        if (!eval_ground[c] && d.t1 != -1000)
+          eval_ground[c] = true;
+        if (!eval_3d[c] && d.t2 != -1000)
+          eval_3d[c] = true;
+        break;
+      }
+    }
+  }
+  fclose(fp);
+  success = true;
+  return detections;
+}
+
+// Reads all ground truth objects of one label file.
+//   file_name  path of the label file (one object per record, 15 fields)
+//   success    false if the file could not be opened, true otherwise
+// Returns the objects in file order; records that do not parse into all
+// 15 fields are dropped.
+vector<tGroundtruth> loadGroundtruth(string file_name,bool &success) {
+
+  // holds all ground truth (ignored ground truth is indicated by an index vector)
+  vector<tGroundtruth> groundtruth;
+  FILE *fp = fopen(file_name.c_str(),"r");
+  if (!fp) {
+    success = false;
+    return groundtruth;
+  }
+
+  for (;;) {
+    tGroundtruth g;
+    char str[255];
+
+    // the field width keeps an overlong type name from overflowing str
+    int n_read = fscanf(fp, "%254s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf",
+                        str, &g.truncation, &g.occlusion, &g.box.alpha,
+                        &g.box.x1,   &g.box.y1,     &g.box.x2,    &g.box.y2,
+                        &g.h,      &g.w,        &g.l,       &g.t1,
+                        &g.t2,      &g.t3,        &g.ry );
+
+    // EOF is also returned on a read error, so the loop always terminates
+    if (n_read == EOF)
+      break;
+
+    // only complete records are kept
+    if (n_read == 15) {
+      g.box.type = str;
+      groundtruth.push_back(g);
+    }
+  }
+  fclose(fp);
+  success = true;
+  return groundtruth;
+}
+
+// Appends one line with the precision values to fp_det and, if orientation
+// values are available, one line with them to fp_ori.
+// Nothing is written at all when precision is empty; fp_ori is only touched
+// when aos is non-empty.
+void saveStats (const vector<double> &precision, const vector<double> &aos, FILE *fp_det, FILE *fp_ori) {
+
+  // nothing to save
+  if (precision.empty())
+    return;
+
+  // precision values, separated by blanks, terminated by a newline
+  for (vector<double>::const_iterator p = precision.begin(); p != precision.end(); ++p)
+    fprintf(fp_det,"%f ",*p);
+  fprintf(fp_det,"\n");
+
+  // orientation similarity is only present if there were no invalid
+  // orientation entries in the submission (alpha=-10)
+  if (!aos.empty()) {
+    for (vector<double>::const_iterator a = aos.begin(); a != aos.end(); ++a)
+      fprintf(fp_ori,"%f ",*a);
+    fprintf(fp_ori,"\n");
+  }
+}
+
+/*=======================================================================
+EVALUATION HELPER FUNCTIONS
+=======================================================================*/
+
+// Overlap of two axis-aligned image boxes.
+// criterion selects the normalization of the intersection area:
+//   -1  area of the union of both boxes (ground truth and detection)
+//    0  area of box a
+//    1  area of box b (detection and "dontcare" areas)
+// Returns 0 if the boxes do not intersect and -1 for any other criterion value.
+inline double imageBoxOverlap(tBox a, tBox b, int32_t criterion=-1){
+
+  // borders of the intersection rectangle
+  const double left   = max(a.x1, b.x1);
+  const double top    = max(a.y1, b.y1);
+  const double right  = min(a.x2, b.x2);
+  const double bottom = min(a.y2, b.y2);
+
+  // extent of the intersection; non-positive means "no overlap"
+  const double inter_w = right-left;
+  const double inter_h = bottom-top;
+  if (inter_w<=0 || inter_h<=0)
+    return 0;
+
+  // areas involved
+  const double shared = inter_w*inter_h;
+  const double area_a = (a.x2-a.x1) * (a.y2-a.y1);
+  const double area_b = (b.x2-b.x1) * (b.y2-b.y1);
+
+  // normalize depending on the users choice
+  switch (criterion) {
+    case -1: return shared / (area_a+area_b-shared); // union
+    case  0: return shared / area_a;                 // bbox_a
+    case  1: return shared / area_b;                 // bbox_b
+    default: return -1;                              // overlap stays invalid
+  }
+}
+
+// same as above, applied to the image boxes of a detection and a ground truth object
+inline double imageBoxOverlap(tDetection a, tGroundtruth b, int32_t criterion=-1){
+  const tBox &det_box = a.box;
+  const tBox &gt_box  = b.box;
+  return imageBoxOverlap(det_box, gt_box, criterion);
+}
+
+// compute polygon of an oriented bounding box
+// T must provide the members ry, l, w, t1 and t3. The four corners
+// (+-l/2, +-w/2) are rotated by ry and shifted by (t1, t3); the first corner
+// is repeated at the end so that the ring is closed.
+template <typename T>
+Polygon toPolygon(const T& g) {
+    using namespace boost::numeric::ublas;
+    using namespace boost::geometry;
+
+    // 2x2 rotation matrix for the angle ry
+    matrix<double> mref(2, 2);
+    mref(0, 0) = cos(g.ry); mref(0, 1) = sin(g.ry);
+    mref(1, 0) = -sin(g.ry); mref(1, 1) = cos(g.ry);
+
+    // corners of the unrotated box, one column per corner
+    // (first row: offsets along the length, second row: offsets along the width)
+    matrix<double> corners(2, 4);
+    double data[] = {g.l / 2, g.l / 2, -g.l / 2, -g.l / 2,
+                     g.w / 2, -g.w / 2, -g.w / 2, g.w / 2};
+    std::copy(data, data + 8, corners.data().begin());
+
+    // rotate, then translate to the box position
+    matrix<double> gc = prod(mref, corners);
+    for (int i = 0; i < 4; ++i) {
+        gc(0, i) += g.t1;
+        gc(1, i) += g.t3;
+    }
+
+    // closed ring: corner 0 appears again as the last point
+    double points[][2] = {{gc(0, 0), gc(1, 0)},{gc(0, 1), gc(1, 1)},{gc(0, 2), gc(1, 2)},{gc(0, 3), gc(1, 3)},{gc(0, 0), gc(1, 0)}};
+    Polygon poly;
+    append(poly, points);
+    return poly;
+}
+
+// measure overlap between bird's eye view bounding boxes, parametrized by (ry, l, w, tx, tz)
+// criterion selects the normalization of the intersection area:
+//   -1  area of the union of both boxes
+//    0  area of the detection box
+//    1  area of the ground truth box
+// Returns -1 for any other criterion value (like imageBoxOverlap) and 0 if the
+// normalizing area is not positive (degenerate box), so the result is never
+// uninitialized or the outcome of a division by zero.
+inline double groundBoxOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) {
+    using namespace boost::geometry;
+    Polygon gp = toPolygon(g);
+    Polygon dp = toPolygon(d);
+
+    // area shared by both boxes (first polygon of the intersection result)
+    std::vector<Polygon> in;
+    intersection(gp, dp, in);
+    double inter_area = in.empty() ? 0 : area(in.front());
+
+    // normalizing area
+    double denom;
+    if(criterion==-1) {   // union
+        std::vector<Polygon> un;
+        union_(gp, dp, un);
+        if(un.empty())    // no union polygon produced
+            return 0;
+        denom = area(un.front());
+    }
+    else if(criterion==0) // bbox_a
+        denom = area(dp);
+    else if(criterion==1) // bbox_b
+        denom = area(gp);
+    else
+        return -1;
+
+    // also catches NaN
+    if(!(denom > 0))
+        return 0;
+
+    return inter_area / denom;
+}
+
+// measure overlap between 3D bounding boxes, parametrized by (ry, h, w, l, tx, ty, tz)
+// The intersection volume is the bird's eye view intersection area times the
+// overlap of the vertical extents [t2-h, t2] of both boxes.
+// criterion selects the normalization of the intersection volume:
+//   -1  volume of the union of both boxes
+//    0  volume of the detection box
+//    1  volume of the ground truth box
+// Returns -1 for any other criterion value (like imageBoxOverlap) and 0 if the
+// normalizing volume is not positive (degenerate box).
+inline double box3DOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) {
+    using namespace boost::geometry;
+    Polygon gp = toPolygon(g);
+    Polygon dp = toPolygon(d);
+
+    // only the intersection polygon is needed; the union volume follows
+    // from the two box volumes
+    std::vector<Polygon> in;
+    intersection(gp, dp, in);
+
+    // overlap of the vertical extents
+    double ymax = min(d.t2, g.t2);
+    double ymin = max(d.t2 - d.h, g.t2 - g.h);
+
+    double inter_area = in.empty() ? 0 : area(in.front());
+    double inter_vol = inter_area * max(0.0, ymax - ymin);
+
+    double det_vol = d.h * d.l * d.w;
+    double gt_vol = g.h * g.l * g.w;
+
+    // normalizing volume
+    double denom;
+    if(criterion==-1)     // union
+        denom = det_vol + gt_vol - inter_vol;
+    else if(criterion==0) // bbox_a
+        denom = det_vol;
+    else if(criterion==1) // bbox_b
+        denom = gt_vol;
+    else
+        return -1;
+
+    // also catches NaN
+    if(!(denom > 0))
+        return 0;
+
+    return inter_vol / denom;
+}
+
+// Selects the detection scores that serve as thresholds for the discretized
+// recall steps.
+//   v              scores of the matched detections; sorted in place (descending)
+//   n_groundtruth  number of ground truth objects (denominator of recall)
+// Returns the selected scores in descending order.
+vector<double> getThresholds(vector<double> &v, double n_groundtruth){
+
+  // scores picked for the recall steps
+  vector<double> picked;
+
+  // highest score first
+  // (highest score is assumed to give best/most confident detections)
+  sort(v.begin(), v.end(), greater<double>());
+
+  const size_t n_scores      = v.size();
+  const double recall_step   = 1.0/(N_SAMPLE_PTS-1.0);
+  double       target_recall = 0;
+
+  for (size_t k=0; k<n_scores; ++k) {
+
+    const bool is_last = (k+1 == n_scores);
+
+    // recall reached with k+1 detections, and with one more (if there is one)
+    const double recall_here = (double)(k+1)/n_groundtruth;
+    const double recall_next = is_last ? recall_here : (double)(k+2)/n_groundtruth;
+
+    // the next score approximates the target recall better => skip this one
+    if (!is_last && (recall_next-target_recall) < (target_recall-recall_here))
+      continue;
+
+    // this score is the best approximation of the current recall step,
+    // keep it and advance to the next step
+    picked.push_back(v[k]);
+    target_recall += recall_step;
+  }
+  return picked;
+}
+
+// Classifies the ground truth objects and detections of one frame for the
+// given class and difficulty.
+//   ignored_gt   one entry appended per ground truth object:
+//                 0 = current class and evaluated, 1 = ignored (neighboring class,
+//                 or current class but too occluded/truncated/small), -1 = other class
+//   dc           all "DontCare" ground truth entries are appended
+//   ignored_det  one entry appended per detection:
+//                 1 = box lower than the minimum height, 0 = current class, -1 = other class
+//   n_gt         incremented for every ground truth object that got a 0
+void cleanData(CLASSES current_class, const vector<tGroundtruth> &gt, const vector<tDetection> &det, vector<int32_t> &ignored_gt, vector<tGroundtruth> &dc, vector<int32_t> &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){
+
+  // name of the evaluated class (lower/upper cases are ignored in all comparisons)
+  const char *wanted = CLASS_NAMES[current_class].c_str();
+  const bool wants_pedestrian = !strcasecmp(wanted, "Pedestrian");
+  const bool wants_car        = !strcasecmp(wanted, "Car");
+
+  // ground truth objects
+  for (size_t k=0; k<gt.size(); ++k) {
+    const tGroundtruth &obj   = gt[k];
+    const char         *label = obj.box.type.c_str();
+
+    // 1: current class, 0: neighboring class ("van" for "car" and
+    // "person_sitting" for "pedestrian"), -1: not used for evaluation
+    int32_t relation = -1;
+    if (!strcasecmp(label, wanted))
+      relation = 1;
+    else if ((wants_pedestrian && !strcasecmp("Person_sitting", label)) ||
+             (wants_car        && !strcasecmp("Van", label)))
+      relation = 0;
+
+    // objects exceeding the occlusion/truncation limits of the difficulty or
+    // being too small count neither as FN nor as TP (detections may still be assigned)
+    const double box_height = obj.box.y2 - obj.box.y1;
+    const bool   too_hard   = obj.occlusion>MAX_OCCLUSION[difficulty]
+                           || obj.truncation>MAX_TRUNCATION[difficulty]
+                           || box_height<MIN_HEIGHT[difficulty];
+
+    if (relation==1 && !too_hard) {
+      // contributes to the recall denominator
+      ignored_gt.push_back(0);
+      n_gt++;
+    } else if (relation==0 || relation==1) {
+      // neighboring class, or current class but too hard
+      ignored_gt.push_back(1);
+    } else {
+      // all other classes
+      ignored_gt.push_back(-1);
+    }
+  }
+
+  // dontcare areas
+  for (size_t k=0; k<gt.size(); ++k) {
+    if (strcasecmp("DontCare", gt[k].box.type.c_str()) == 0)
+      dc.push_back(gt[k]);
+  }
+
+  // detections (neighboring classes are not evaluated)
+  for (size_t k=0; k<det.size(); ++k) {
+    const bool same_class = !strcasecmp(det[k].box.type.c_str(), wanted);
+
+    // the height is truncated to an integer before it is compared
+    const int32_t box_height = fabs(det[k].box.y1 - det[k].box.y2);
+
+    if (box_height<MIN_HEIGHT[difficulty])
+      ignored_det.push_back(1);
+    else
+      ignored_det.push_back(same_class ? 0 : -1);
+  }
+}
+
+// Matches the detections of one frame against its ground truth and counts
+// TP, FP and FN for one class.
+//   gt, det       ground truth objects and detections of the frame
+//   dc            "dontcare" areas of the frame
+//   ignored_gt    per ground truth object: 0 = evaluated, 1 = ignored, -1 = other class
+//   ignored_det   per detection: 0 = evaluated, 1 = ignored (too small), -1 = other class
+//   compute_fp    false: only collect the scores of matched detections (each ground
+//                 truth object takes the matching detection with the highest score);
+//                 true: detections scoring below thresh are dropped, each ground truth
+//                 object takes the detection with the greatest overlap, and false
+//                 positives (minus detections inside dontcare areas) are counted
+//   boxoverlap    overlap function for the evaluated metric
+//   metric        selects the row of MIN_OVERLAP
+//   compute_aos   accumulate orientation similarity (only done when compute_fp is set)
+//   thresh        score threshold, only used when compute_fp is set
+//   debug         not used inside this function
+// Returns the counters, the scores of the true positives (v) and the summed
+// orientation similarity (-1 if there was neither a TP nor a FP).
+tPrData computeStatistics(CLASSES current_class, const vector<tGroundtruth> &gt,
+        const vector<tDetection> &det, const vector<tGroundtruth> &dc,
+        const vector<int32_t> &ignored_gt, const vector<int32_t>  &ignored_det,
+        bool compute_fp, double (*boxoverlap)(tDetection, tGroundtruth, int32_t),
+        METRIC metric, bool compute_aos=false, double thresh=0, bool debug=false){
+
+  tPrData stat = tPrData();
+  const double NO_DETECTION = -10000000;  // sentinel: no detection assigned yet
+  vector<double> delta;            // holds angular difference for TPs (needed for AOS evaluation)
+  vector<bool> assigned_detection; // holds whether a detection was assigned to a valid or ignored ground truth
+  assigned_detection.assign(det.size(), false);
+  vector<bool> ignored_threshold;
+  ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed
+
+  // detections with a low score are ignored for computing precision (needs FP)
+  if(compute_fp)
+    for(int32_t i=0; i<det.size(); i++)
+      if(det[i].thresh<thresh)
+        ignored_threshold[i] = true;
+
+  // evaluate all ground truth boxes
+  for(int32_t i=0; i<gt.size(); i++){
+
+    // this ground truth is not of the current or a neighboring class and therefore ignored
+    if(ignored_gt[i]==-1)
+      continue;
+
+    /*=======================================================================
+    find candidates (overlap with ground truth > MIN_OVERLAP[metric][current_class])
+    =======================================================================*/
+    int32_t det_idx          = -1;
+    double valid_detection = NO_DETECTION;
+    double max_overlap     = 0;
+
+    // search for a possible detection
+    // (assigned_ignored_det: the current candidate is a too small, ignored detection)
+    bool assigned_ignored_det = false;
+    for(int32_t j=0; j<det.size(); j++){
+
+      // detections not of the current class, already assigned or with a low threshold are ignored
+      if(ignored_det[j]==-1)
+        continue;
+      if(assigned_detection[j])
+        continue;
+      if(ignored_threshold[j])
+        continue;
+
+      // overlap of this detection with the ground truth box, normalized by the union
+      double overlap = boxoverlap(det[j], gt[i], -1);
+
+      // for computing recall thresholds, the candidate with highest score is considered
+      if(!compute_fp && overlap>MIN_OVERLAP[metric][current_class] && det[j].thresh>valid_detection){
+        det_idx         = j;
+        valid_detection = det[j].thresh;
+      }
+
+      // for computing pr curve values, the candidate with the greatest overlap is considered
+      // if the greatest overlap is an ignored detection (min_height), the overlapping detection is used
+      else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){
+        max_overlap     = overlap;
+        det_idx         = j;
+        valid_detection = 1;
+        assigned_ignored_det = false;
+      }
+      // an ignored (too small) detection is only taken as long as nothing else was found
+      else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){
+        det_idx              = j;
+        valid_detection      = 1;
+        assigned_ignored_det = true;
+      }
+    }
+
+    /*=======================================================================
+    compute TP, FP and FN
+    =======================================================================*/
+
+    // nothing was assigned to this valid ground truth
+    if(valid_detection==NO_DETECTION && ignored_gt[i]==0) {
+      stat.fn++;
+    }
+
+    // only evaluate valid ground truth <=> detection assignments (considering difficulty level)
+    // (the detection is marked as used, but counts neither as TP nor as FP)
+    else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1))
+      assigned_detection[det_idx] = true;
+
+    // found a valid true positive
+    else if(valid_detection!=NO_DETECTION){
+
+      // write highest score to threshold vector
+      stat.tp++;
+      stat.v.push_back(det[det_idx].thresh);
+
+      // compute angular difference of detection and ground truth if valid detection orientation was provided
+      if(compute_aos)
+        delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha);
+
+      // clean up
+      assigned_detection[det_idx] = true;
+    }
+  }
+
+  // if FP are requested, consider stuff area
+  if(compute_fp){
+
+    // count fp
+    for(int32_t i=0; i<det.size(); i++){
+
+      // count false positives if required (height smaller than required is ignored (ignored_det==1)
+      if(!(assigned_detection[i] || ignored_det[i]==-1 || ignored_det[i]==1 || ignored_threshold[i]))
+        stat.fp++;
+    }
+
+    // do not consider detections overlapping with stuff area
+    int32_t nstuff = 0;
+    for(int32_t i=0; i<dc.size(); i++){
+      for(int32_t j=0; j<det.size(); j++){
+
+        // detections not of the current class, already assigned, with a low threshold or a low minimum height are ignored
+        if(assigned_detection[j])
+          continue;
+        if(ignored_det[j]==-1 || ignored_det[j]==1)
+          continue;
+        if(ignored_threshold[j])
+          continue;
+
+        // compute overlap and assign to stuff area, if overlap exceeds class specific value
+        // (criterion 0: the intersection is normalized by the detection box)
+        double overlap = boxoverlap(det[j], dc[i], 0);
+        if(overlap>MIN_OVERLAP[metric][current_class]){
+          assigned_detection[j] = true;
+          nstuff++;
+        }
+      }
+    }
+
+    // FP = no. of all not to ground truth assigned detections - detections assigned to stuff areas
+    stat.fp -= nstuff;
+
+    // if all orientation values are valid, the AOS is computed
+    if(compute_aos){
+      vector<double> tmp;
+
+      // FP have a similarity of 0, for all TP compute AOS
+      tmp.assign(stat.fp, 0);
+      for(int32_t i=0; i<delta.size(); i++)
+        tmp.push_back((1.0+cos(delta[i]))/2.0);
+
+      // be sure, that all orientation deltas are computed
+      assert(tmp.size()==stat.fp+stat.tp);
+      assert(delta.size()==stat.tp);
+
+      // get the summed orientation similarity for this image
+      // (the caller divides by tp+fp after accumulating over all images)
+      if(stat.tp>0 || stat.fp>0)
+        stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0);
+
+      // there was neither a FP nor a TP, so the similarity is ignored in the evaluation
+      else
+        stat.similarity = -1;
+    }
+  }
+  return stat;
+}
+
+/*=======================================================================
+EVALUATE CLASS-WISE
+=======================================================================*/
+
+// Evaluates one class at one difficulty level with the given overlap function.
+//   fp_det, fp_ori  files the precision / orientation values are appended to
+//   groundtruth,
+//   detections      per-frame ground truth and detections (same number of frames)
+//   precision, aos  receive N_SAMPLE_PTS values each (aos only if compute_aos is set)
+// Always returns true.
+bool eval_class (FILE *fp_det, FILE *fp_ori, CLASSES current_class,
+        const vector< vector<tGroundtruth> > &groundtruth,
+        const vector< vector<tDetection> > &detections, bool compute_aos,
+        double (*boxoverlap)(tDetection, tGroundtruth, int32_t),
+        vector<double> &precision, vector<double> &aos,
+        DIFFICULTY difficulty, METRIC metric) {
+  assert(groundtruth.size() == detections.size());
+
+  const size_t n_frames = groundtruth.size();
+
+  int32_t n_gt = 0;                                // total no. of gt (denominator of recall)
+  vector<double> scores;                           // scores of all matched detections
+  vector< vector<int32_t> > gt_flags, det_flags;   // ignore flags per frame for current class/difficulty
+  vector< vector<tGroundtruth> > dontcare;         // dontcare areas per frame
+
+  // first pass over all frames: classify the data and collect the scores
+  for (size_t f=0; f<n_frames; ++f) {
+
+    vector<int32_t> frame_gt_flags, frame_det_flags;
+    vector<tGroundtruth> frame_dc;
+
+    // only evaluate objects of current class and ignore occluded, truncated objects
+    cleanData(current_class, groundtruth[f], detections[f], frame_gt_flags, frame_dc, frame_det_flags, n_gt, difficulty);
+    gt_flags.push_back(frame_gt_flags);
+    det_flags.push_back(frame_det_flags);
+    dontcare.push_back(frame_dc);
+
+    // statistics without false positives, just to get the scores of the matches
+    const tPrData matched = computeStatistics(current_class, groundtruth[f], detections[f], frame_dc,
+                                              frame_gt_flags, frame_det_flags, false, boxoverlap, metric);
+    scores.insert(scores.end(), matched.v.begin(), matched.v.end());
+  }
+
+  // scores that must be evaluated for recall discretization
+  const vector<double> thresholds = getThresholds(scores, n_gt);
+  const size_t n_thresholds = thresholds.size();
+
+  // second pass: accumulate TP, FP, FN and similarity for every threshold
+  vector<tPrData> pr(n_thresholds, tPrData());
+  for (size_t f=0; f<n_frames; ++f) {
+    for (size_t t=0; t<n_thresholds; ++t) {
+      const tPrData frame_stat = computeStatistics(current_class, groundtruth[f], detections[f], dontcare[f],
+                                                   gt_flags[f], det_flags[f], true, boxoverlap, metric,
+                                                   compute_aos, thresholds[t], t==38);
+
+      tPrData &total = pr[t];
+      total.tp += frame_stat.tp;
+      total.fp += frame_stat.fp;
+      total.fn += frame_stat.fn;
+
+      // -1 marks a frame without TP and FP
+      if (frame_stat.similarity!=-1)
+        total.similarity += frame_stat.similarity;
+    }
+  }
+
+  // compute recall, precision and AOS
+  vector<double> recall;
+  precision.assign(N_SAMPLE_PTS, 0);
+  if (compute_aos)
+    aos.assign(N_SAMPLE_PTS, 0);
+  for (size_t t=0; t<n_thresholds; ++t) {
+    const tPrData &total       = pr[t];
+    const double   n_positives = (double)(total.tp + total.fn);
+    const double   n_reported  = (double)(total.tp + total.fp);
+
+    recall.push_back(total.tp/n_positives);
+    precision[t] = total.tp/n_reported;
+    if (compute_aos)
+      aos[t] = total.similarity/n_reported;
+  }
+
+  // filter precision and AOS using max_{i..end}(precision)
+  for (size_t t=0; t<n_thresholds; ++t) {
+    precision[t] = *max_element(precision.begin()+t, precision.end());
+    if (compute_aos)
+      aos[t] = *max_element(aos.begin()+t, aos.end());
+  }
+
+  // save statistics and finish with success
+  saveStats(precision, aos, fp_det, fp_ori);
+  return true;
+}
+
+// Writes the curves of one class to <dir_name>/<file_name>.txt, prints the
+// average precision (mean of every 4th sample, i.e. 11 recall positions) and
+// renders the curves as png, eps and pdf with gnuplot, ps2pdf and pdfcrop.
+//   vals      array of three vectors (easy, moderate, hard) with at least
+//             N_SAMPLE_PTS values each
+//   is_aos    selects the label of the y axis
+// If an output file cannot be opened an error is printed and the function
+// returns without plotting.
+void saveAndPlotPlots(string dir_name,string file_name,string obj_type,vector<double> vals[],bool is_aos){
+
+  // shell commands are assembled as strings, so long paths cannot overflow
+  // a fixed size buffer
+  const string base = dir_name + "/" + file_name;
+
+  // save plot data to file
+  FILE *fp = fopen((base + ".txt").c_str(),"w");
+  if (!fp) {
+    printf("ERROR: Couldn't write %s\n", (base + ".txt").c_str());
+    return;
+  }
+  printf("save %s\n", (base + ".txt").c_str());
+  for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++)
+    fprintf(fp,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]);
+  fclose(fp);
+
+  // average precision over every 4th recall sample
+  float sum[3] = {0, 0, 0};
+  for (int v = 0; v < 3; ++v)
+      for (size_t i = 0; i < vals[v].size(); i = i + 4)
+          sum[v] += vals[v][i];
+  printf("%s AP: %f %f %f\n", file_name.c_str(), sum[0] / 11 * 100, sum[1] / 11 * 100, sum[2] / 11 * 100);
+
+  // plot title: object type with a capital first letter
+  if (!obj_type.empty())
+    obj_type[0] = toupper(obj_type[0]);
+
+  // create png + eps
+  for (int32_t j=0; j<2; j++) {
+
+    // open file
+    FILE *fp_gp = fopen((base + ".gp").c_str(),"w");
+    if (!fp_gp) {
+      printf("ERROR: Couldn't write %s\n", (base + ".gp").c_str());
+      return;
+    }
+
+    // save gnuplot instructions
+    if (j==0) {
+      fprintf(fp_gp,"set term png size 450,315 font \"Helvetica\" 11\n");
+      fprintf(fp_gp,"set output \"%s.png\"\n",file_name.c_str());
+    } else {
+      fprintf(fp_gp,"set term postscript eps enhanced color font \"Helvetica\" 20\n");
+      fprintf(fp_gp,"set output \"%s.eps\"\n",file_name.c_str());
+    }
+
+    // set labels and ranges
+    fprintf(fp_gp,"set size ratio 0.7\n");
+    fprintf(fp_gp,"set xrange [0:1]\n");
+    fprintf(fp_gp,"set yrange [0:1]\n");
+    fprintf(fp_gp,"set xlabel \"Recall\"\n");
+    if (!is_aos) fprintf(fp_gp,"set ylabel \"Precision\"\n");
+    else         fprintf(fp_gp,"set ylabel \"Orientation Similarity\"\n");
+    fprintf(fp_gp,"set title \"%s\"\n",obj_type.c_str());
+
+    // line width
+    int32_t   lw = 5;
+    if (j==0) lw = 3;
+
+    // plot error curve
+    fprintf(fp_gp,"plot ");
+    fprintf(fp_gp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw);
+    fprintf(fp_gp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw);
+    fprintf(fp_gp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw);
+
+    // close file
+    fclose(fp_gp);
+
+    // run gnuplot => create png + eps
+    system(("cd " + dir_name + "; gnuplot " + file_name + ".gp").c_str());
+  }
+
+  // create pdf and crop
+  system(("cd " + dir_name + "; ps2pdf " + file_name + ".eps " + file_name + "_large.pdf").c_str());
+  system(("cd " + dir_name + "; pdfcrop " + file_name + "_large.pdf " + file_name + ".pdf").c_str());
+  system(("cd " + dir_name + "; rm " + file_name + "_large.pdf").c_str());
+}
+
+// Collects the frame indices of the files found in result_dir.
+// Every directory entry whose name has at least 10 characters contributes the
+// integer parsed (atoi) from its last 10 characters, e.g. "000123.txt" => 123.
+// The indices are appended to the file-level vector "indices", a copy of which
+// is returned. If the directory cannot be opened nothing is appended.
+vector<int32_t> getEvalIndices(const string& result_dir) {
+
+    DIR* dir = opendir(result_dir.c_str());
+    if (dir) {
+        dirent* entity;
+        while ((entity = readdir(dir)) != NULL) {
+            string path(entity->d_name);
+            int32_t len = path.size();
+
+            // too short for "NNNNNN.txt" (this also skips "." and "..")
+            if (len < 10) continue;
+
+            int32_t index = atoi(path.substr(len - 10, 10).c_str());
+            indices.push_back(index);
+        }
+
+        // release the directory handle
+        closedir(dir);
+    }
+    return indices;
+}
+
+// Evaluates class c under one metric for the difficulties EASY, MODERATE and
+// HARD, writes the stats file(s) next to result_dir and the plots to plot_dir.
+//   suffix  appended to "<class>_detection" in the stats and plot names
+//           ("" for image boxes, "_ground" and "_3d" for the other metrics)
+// Returns false if an output file cannot be opened or eval_class fails; all
+// files opened here are closed on every path.
+static bool evalMetricForClass(const string &result_dir, const string &plot_dir, int c,
+        const string &suffix,
+        const vector< vector<tGroundtruth> > &groundtruth,
+        const vector< vector<tDetection> > &detections, bool compute_aos,
+        double (*boxoverlap)(tDetection, tGroundtruth, int32_t),
+        METRIC metric, Mail *mail) {
+
+  const CLASSES cls = (CLASSES)c;
+  const string stats_prefix = result_dir + "/../stats_" + CLASS_NAMES[c];
+
+  // open result files (the orientation file only if AOS is evaluated)
+  const string det_path = stats_prefix + "_detection" + suffix + ".txt";
+  FILE *fp_det = fopen(det_path.c_str(), "w");
+  if (!fp_det) {
+    mail->msg("ERROR: Couldn't write: %s", det_path.c_str());
+    return false;
+  }
+  FILE *fp_ori = 0;
+  if (compute_aos) {
+    const string ori_path = stats_prefix + "_orientation.txt";
+    fp_ori = fopen(ori_path.c_str(), "w");
+    if (!fp_ori) {
+      fclose(fp_det);
+      mail->msg("ERROR: Couldn't write: %s", ori_path.c_str());
+      return false;
+    }
+  }
+
+  // evaluate the three difficulty levels, stop at the first failure
+  static const DIFFICULTY levels[3] = {EASY, MODERATE, HARD};
+  vector<double> precision[3], aos[3];
+  bool ok = true;
+  for (int d = 0; d < 3 && ok; d++)
+    ok = eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos,
+                    boxoverlap, precision[d], aos[d], levels[d], metric);
+
+  fclose(fp_det);
+  if (fp_ori)
+    fclose(fp_ori);
+
+  if (!ok) {
+    mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str());
+    return false;
+  }
+
+  saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection" + suffix, CLASS_NAMES[c], precision, 0);
+  if (compute_aos)
+    saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_orientation", CLASS_NAMES[c], aos, 1);
+  return true;
+}
+
+// Runs the complete offline evaluation.
+//   gt_dir      directory with the ground truth label files ("%06d.txt")
+//   result_dir  directory with the detection files ("%06d.txt"); stats files
+//               and the plot directory are created in its parent directory
+//   mail        receives progress and error messages
+// Returns true on success, false if a file could not be read or written or a
+// class evaluation failed.
+bool eval(string gt_dir, string result_dir, Mail* mail){
+
+  // set some global parameters
+  initGlobals();
+
+  // output directory for the plots
+  string plot_dir       = result_dir + "/../plot";
+
+  // create output directories (-p: no error if the directory already exists)
+  system(("mkdir -p " + plot_dir).c_str());
+
+  // hold detections and ground truth in memory
+  vector< vector<tGroundtruth> > groundtruth;
+  vector< vector<tDetection> >   detections;
+
+  // holds whether orientation similarity shall be computed (might be set to false while loading detections)
+  // and which labels were provided by this submission
+  bool compute_aos=true;
+  vector<bool> eval_image(NUM_CLASS, false);
+  vector<bool> eval_ground(NUM_CLASS, false);
+  vector<bool> eval_3d(NUM_CLASS, false);
+
+  // for all images read groundtruth and detections
+  mail->msg("Loading detections...");
+  std::vector<int32_t> indices = getEvalIndices(result_dir);
+  printf("number of files for evaluation: %d\n", (int)indices.size());
+
+  for (size_t i=0; i<indices.size(); i++) {
+
+    // file name
+    char file_name[256];
+    sprintf(file_name,"%06d.txt",indices.at(i));
+
+    // read ground truth and result poses
+    // (both paths get an explicit separator, so result_dir works with and
+    // without a trailing slash)
+    bool gt_success,det_success;
+    vector<tGroundtruth> gt   = loadGroundtruth(gt_dir + "/" + file_name,gt_success);
+    vector<tDetection>   det  = loadDetections(result_dir + "/" + file_name,
+            compute_aos, eval_image, eval_ground, eval_3d, det_success);
+
+    // check for errors
+    if (!gt_success) {
+      mail->msg("ERROR: Couldn't read: %s of ground truth. Please write me an email!", file_name);
+      return false;
+    }
+    if (!det_success) {
+      mail->msg("ERROR: Couldn't read: %s", file_name);
+      return false;
+    }
+
+    groundtruth.push_back(gt);
+    detections.push_back(det);
+  }
+  mail->msg("  done.");
+
+  // eval image 2D bounding boxes
+  for (int c = 0; c < NUM_CLASS; c++) {
+    if (eval_image[c] &&
+        !evalMetricForClass(result_dir, plot_dir, c, "", groundtruth, detections,
+                            compute_aos, imageBoxOverlap, IMAGE, mail))
+      return false;
+  }
+
+  // don't evaluate AOS for birdview boxes and 3D boxes
+  compute_aos = false;
+
+  // eval bird's eye view bounding boxes
+  for (int c = 0; c < NUM_CLASS; c++) {
+    if (eval_ground[c] &&
+        !evalMetricForClass(result_dir, plot_dir, c, "_ground", groundtruth, detections,
+                            compute_aos, groundBoxOverlap, GROUND, mail))
+      return false;
+  }
+
+  // eval 3D bounding boxes
+  for (int c = 0; c < NUM_CLASS; c++) {
+    if (eval_3d[c] &&
+        !evalMetricForClass(result_dir, plot_dir, c, "_3d", groundtruth, detections,
+                            compute_aos, box3DOverlap, BOX3D, mail))
+      return false;
+  }
+
+  // success
+  return true;
+}
+
+// Entry point of the offline evaluation tool.
+// Usage: ./eval_detection_3d_offline gt_dir result_dir
+// Returns 1 on a wrong number of arguments, 0 otherwise (also when the
+// evaluation itself failed; the failure is only reported as a message).
+int32_t main (int32_t argc,char *argv[]) {
+
+  // we need exactly 2 arguments besides the program name!
+  if (argc!=3) {
+    cout << "Usage: ./eval_detection_3d_offline gt_dir result_dir" << endl;
+    return 1;
+  }
+
+  // read arguments
+  string gt_dir = argv[1];
+  string result_dir = argv[2];
+
+  // init notification mail (no address given)
+  Mail *mail;
+  mail = new Mail();
+  mail->msg("Thank you for participating in our evaluation!");
+
+  // run evaluation
+  if (eval(gt_dir, result_dir, mail)) {
+    mail->msg("Your evaluation results are available at:");
+    // the path is user input and must not be interpreted as a format string
+    mail->msg("%s", result_dir.c_str());
+  } else {
+    // NOTE(review): result_dir comes straight from the command line and is
+    // pasted into a shell command; only call this tool with trusted input
+    system(("rm -r " + result_dir + "/../plot").c_str());
+    mail->msg("An error occured while processing your results.");
+  }
+
+  // send mail and exit
+  delete mail;
+
+  return 0;
+}
+
+
diff --git a/src/tools/kitti_eval/mail.h b/src/tools/kitti_eval/mail.h
new file mode 100644
index 0000000..20fa986
--- /dev/null
+++ b/src/tools/kitti_eval/mail.h
@@ -0,0 +1,48 @@
+#ifndef MAIL_H
+#define MAIL_H
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+
+// Echoes messages to stdout and, when a recipient address is given, to a sendmail pipe.
+class Mail {
+public:
+  // Opens the sendmail pipe and writes the mail headers; an empty address means stdout only.
+  Mail (std::string email = "") : mail(0) {
+    if (email.empty()) return;
+    mail = popen("/usr/lib/sendmail -t -f noreply@cvlibs.net","w");
+    if (!mail) return;  // popen failed: leave mail null so msg() still prints to stdout
+    fprintf(mail,"To: %s\n", email.c_str());
+    fprintf(mail,"From: noreply@cvlibs.net\n");
+    fprintf(mail,"Subject: KITTI Evaluation Benchmark\n");
+    fprintf(mail,"\n\n");
+  }
+
+  // Closes the pipe opened by the constructor, if any.
+  ~Mail() {
+    if (mail) {
+      pclose(mail);
+    }
+  }
+
+  // printf-style: writes the formatted text plus a newline to the pipe (if open) and to stdout.
+  void msg (const char *format, ...) {
+    va_list args;
+    if (mail) {
+      va_start(args,format);  // a va_list may be walked only once, so start it anew per sink
+      vfprintf(mail,format,args);
+      va_end(args);
+      fprintf(mail,"\n");
+    }
+    va_start(args,format);
+    vprintf(format,args);
+    va_end(args);
+    printf("\n");
+  }
+
+private:
+  FILE *mail;  // sendmail pipe, or null when messages go to stdout only
+};
+
+#endif
diff --git a/src/tools/merge_pascal_json.py b/src/tools/merge_pascal_json.py
new file mode 100644
index 0000000..80765d3
--- /dev/null
+++ b/src/tools/merge_pascal_json.py
@@ -0,0 +1,30 @@
+import json
+
+# ANNOT_PATH = '/home/zxy/Datasets/VOC/annotations/'
+ANNOT_PATH = 'voc/annotations/'
+OUT_PATH = ANNOT_PATH
+INPUT_FILES = ['pascal_train2012.json', 'pascal_val2012.json',
+               'pascal_train2007.json', 'pascal_val2007.json']
+OUTPUT_FILE = 'pascal_trainval0712.json'
+KEYS = ['images', 'type', 'annotations', 'categories']
+MERGE_KEYS = ['images', 'annotations']
+
+out = {}
+tot_anns = 0
+for i, file_name in enumerate(INPUT_FILES):
+  data = json.load(open(ANNOT_PATH + file_name, 'r'))
+  print('keys', data.keys())
+  if i == 0:
+    for key in KEYS:
+      out[key] = data[key]
+      print(file_name, key, len(data[key]))
+  else:
+    out['images'] += data['images']
+    for j in range(len(data['annotations'])):
+      data['annotations'][j]['id'] += tot_anns
+    out['annotations'] += data['annotations']
+    print(file_name, 'images', len(data['images']))
+    print(file_name, 'annotations', len(data['annotations']))
+  tot_anns = len(out['annotations'])
+print('tot', len(out['annotations']))
+json.dump(out, open(OUT_PATH + OUTPUT_FILE, 'w'))
diff --git a/src/tools/reval.py b/src/tools/reval.py
new file mode 100755
index 0000000..c2d881c
--- /dev/null
+++ b/src/tools/reval.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# Modified by Xingyi Zhou
+# --------------------------------------------------------
+
+# Reval = re-eval. Re-evaluate saved detections.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import os.path as osp
+sys.path.insert(0, osp.join(osp.dirname(__file__), 'voc_eval_lib'))
+
+from model.test import apply_nms
+from datasets.pascal_voc import pascal_voc
+import pickle
+import os, argparse
+import numpy as np
+import json
+
+def parse_args():
+  """
+  Parse input arguments
+  """
+  parser = argparse.ArgumentParser(description='Re-evaluate results')
+  parser.add_argument('detection_file', type=str)
+  parser.add_argument('--output_dir', help='results directory', type=str)
+  parser.add_argument('--imdb', dest='imdb_name',
+                      help='dataset to re-evaluate',
+                      default='voc_2007_test', type=str)
+  parser.add_argument('--matlab', dest='matlab_eval',
+                      help='use matlab for evaluation',
+                      action='store_true')
+  parser.add_argument('--comp', dest='comp_mode', help='competition mode',
+                      action='store_true')
+  parser.add_argument('--nms', dest='apply_nms', help='apply nms',
+                      action='store_true')
+
+  if len(sys.argv) == 1:
+    parser.print_help()
+    sys.exit(1)
+
+  args = parser.parse_args()
+  return args
+
+
+def from_dets(imdb_name, detection_file, args):
+  imdb = pascal_voc('test', '2007')
+  imdb.competition_mode(args.comp_mode)
+  imdb.config['matlab_eval'] = args.matlab_eval
+  with open(os.path.join(detection_file), 'rb') as f:
+    if 'json' in detection_file:
+      dets = json.load(f)
+    else:
+      dets = pickle.load(f, encoding='latin1')
+  # import pdb; pdb.set_trace()
+  if args.apply_nms:
+    print('Applying NMS to all detections')
+    test_nms = 0.3
+    nms_dets = apply_nms(dets, test_nms)
+  else:
+    nms_dets = dets
+
+  print('Evaluating detections')
+  imdb.evaluate_detections(nms_dets)
+
+
+if __name__ == '__main__':
+  args = parse_args()
+  # Script entry point: re-evaluate the detections stored in args.detection_file.
+  imdb_name = args.imdb_name
+  from_dets(imdb_name, args.detection_file, args)
diff --git a/src/tools/vis_pred.py b/src/tools/vis_pred.py
new file mode 100644
index 0000000..832e13a
--- /dev/null
+++ b/src/tools/vis_pred.py
@@ -0,0 +1,104 @@
+import pycocotools.coco as coco
+from pycocotools.cocoeval import COCOeval
+import sys
+import cv2
+import numpy as np
+import pickle
+IMG_PATH = '../../data/coco/val2017/'
+ANN_PATH = '../../data/coco/annotations/instances_val2017.json'
+DEBUG = True
+
+def _coco_box_to_bbox(box):
+  bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]],
+                  dtype=np.int32)
+  return bbox
+
+_cat_ids = [  # the 80 category ids handled by this script (the id sequence has gaps)
+  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 
+  14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 
+  24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 
+  37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 
+  58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 
+  72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 
+  82, 84, 85, 86, 87, 88, 89, 90
+]
+num_classes = 80
+_classes = {  # 1-based class index -> category id
+  ind + 1: cat_id for ind, cat_id in enumerate(_cat_ids)
+}
+_to_order = {cat_id: ind for ind, cat_id in enumerate(_cat_ids)}  # category id -> 0-based index
+coco = coco.COCO(ANN_PATH)  # NOTE: rebinds `coco` from the pycocotools module to the loaded annotations
+CAT_NAMES = [coco.loadCats([_classes[i + 1]])[0]['name'] \
+              for i in range(num_classes)]
+COLORS = [((np.random.random((3, )) * 0.6 + 0.4)*255).astype(np.uint8) \
+              for _ in range(num_classes)]
+
+
+def add_box(image, bbox, sc, cat_id):
+  cat_id = _to_order[cat_id]
+  cat_name = CAT_NAMES[cat_id]
+  cat_size  = cv2.getTextSize(cat_name + '0', cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0]
+  color = np.array(COLORS[cat_id]).astype(np.int32).tolist()
+  txt = '{}{:.0f}'.format(cat_name, sc * 10)
+  if bbox[1] - cat_size[1] - 2 < 0:
+    cv2.rectangle(image,
+                  (bbox[0], bbox[1] + 2),
+                  (bbox[0] + cat_size[0], bbox[1] + cat_size[1] + 2),
+                  color, -1)
+    cv2.putText(image, txt, 
+                (bbox[0], bbox[1] + cat_size[1] + 2), 
+                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), thickness=1)
+  else:
+    cv2.rectangle(image,
+                  (bbox[0], bbox[1] - cat_size[1] - 2),
+                  (bbox[0] + cat_size[0], bbox[1] - 2),
+                  color, -1)
+    cv2.putText(image, txt, 
+                (bbox[0], bbox[1] - 2), 
+                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), thickness=1)
+  cv2.rectangle(image,
+                (bbox[0], bbox[1]),
+                (bbox[2], bbox[3]),
+                color, 2)
+  return image
+
+if __name__ == '__main__':
+  dets = []  # one loaded result set per prediction file given on the command line
+  img_ids = coco.getImgIds()
+  num_images = len(img_ids)
+  for k in range(1, len(sys.argv)):
+    pred_path = sys.argv[k]
+    dets.append(coco.loadRes(pred_path))
+  # For every image: show the ground truth and each result set in its own window.
+  for i, img_id in enumerate(img_ids):
+    img_info = coco.loadImgs(ids=[img_id])[0]
+    img_path = IMG_PATH + img_info['file_name']
+    img = cv2.imread(img_path)
+    gt_ids = coco.getAnnIds(imgIds=[img_id])
+    gts = coco.loadAnns(gt_ids)
+    gt_img = img.copy()
+    for j, pred in enumerate(gts):
+      bbox = _coco_box_to_bbox(pred['bbox'])
+      cat_id = pred['category_id']
+      gt_img = add_box(gt_img, bbox, 0, cat_id)  # ground truth is drawn with score 0
+    for k in range(len(dets)):
+      pred_ids = dets[k].getAnnIds(imgIds=[img_id])
+      preds = dets[k].loadAnns(pred_ids)
+      pred_img = img.copy()
+      for j, pred in enumerate(preds):
+        bbox = _coco_box_to_bbox(pred['bbox'])
+        sc = pred['score']
+        cat_id = pred['category_id']
+        if sc > 0.2:  # only predictions scoring above 0.2 are drawn
+          pred_img = add_box(pred_img, bbox, sc, cat_id)
+      cv2.imshow('pred{}'.format(k), pred_img)
+      # cv2.imwrite('vis/{}_pred{}.png'.format(i, k), pred_img)
+    cv2.imshow('gt', gt_img)
+    # cv2.imwrite('vis/{}_gt.png'.format(i), gt_img)
+    cv2.waitKey()  # wait for a key press before moving on to the next image
+  # coco_eval.evaluate()
+  # coco_eval.accumulate()
+  # coco_eval.summarize()
+
+  
diff --git a/src/tools/voc_eval_lib/LICENSE b/src/tools/voc_eval_lib/LICENSE
new file mode 100644
index 0000000..f68854d
--- /dev/null
+++ b/src/tools/voc_eval_lib/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Xinlei Chen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/src/tools/voc_eval_lib/Makefile b/src/tools/voc_eval_lib/Makefile
new file mode 100644
index 0000000..1e9e686
--- /dev/null
+++ b/src/tools/voc_eval_lib/Makefile
@@ -0,0 +1,6 @@
+all:
+	python setup.py build_ext --inplace
+	rm -rf build
+clean:
+	rm -rf */*.pyc
+	rm -rf */*.so
diff --git a/src/tools/voc_eval_lib/__init__.py b/src/tools/voc_eval_lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tools/voc_eval_lib/datasets/__init__.py b/src/tools/voc_eval_lib/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tools/voc_eval_lib/datasets/bbox.pyx b/src/tools/voc_eval_lib/datasets/bbox.pyx
new file mode 100644
index 0000000..0f9c696
--- /dev/null
+++ b/src/tools/voc_eval_lib/datasets/bbox.pyx
@@ -0,0 +1,56 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Sergey Karayev
+# --------------------------------------------------------
+
+cimport cython
+import numpy as np
+cimport numpy as np
+
+DTYPE = np.float64  # np.float (an alias of the builtin float) was removed in NumPy 1.24
+ctypedef np.float64_t DTYPE_t
+
+def bbox_overlaps(
+        np.ndarray[DTYPE_t, ndim=2] boxes,
+        np.ndarray[DTYPE_t, ndim=2] query_boxes):
+    """
+    Compute the pairwise intersection-over-union of two sets of boxes.
+    Rows are read as [x1, y1, x2, y2]; widths and heights count both end pixels (+1).
+    boxes: (N, 4) ndarray of float
+    query_boxes: (K, 4) ndarray of float
+    Returns
+    -------
+    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
+    """
+    cdef unsigned int N = boxes.shape[0]
+    cdef unsigned int K = query_boxes.shape[0]
+    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
+    cdef DTYPE_t iw, ih, box_area
+    cdef DTYPE_t ua
+    cdef unsigned int k, n
+    for k in range(K):
+        box_area = (  # area of the query box, computed once per column
+            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
+            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
+        )
+        for n in range(N):
+            iw = (  # width of the intersection; <= 0 means no horizontal overlap
+                min(boxes[n, 2], query_boxes[k, 2]) -
+                max(boxes[n, 0], query_boxes[k, 0]) + 1
+            )
+            if iw > 0:
+                ih = (  # height of the intersection
+                    min(boxes[n, 3], query_boxes[k, 3]) -
+                    max(boxes[n, 1], query_boxes[k, 1]) + 1
+                )
+                if ih > 0:
+                    ua = float(  # union area = area(box) + area(query) - intersection
+                        (boxes[n, 2] - boxes[n, 0] + 1) *
+                        (boxes[n, 3] - boxes[n, 1] + 1) +
+                        box_area - iw * ih
+                    )
+                    overlaps[n, k] = iw * ih / ua  # disjoint pairs keep the initial 0
+    return overlaps
+
diff --git a/src/tools/voc_eval_lib/datasets/ds_utils.py b/src/tools/voc_eval_lib/datasets/ds_utils.py
new file mode 100644
index 0000000..fd5ca4b
--- /dev/null
+++ b/src/tools/voc_eval_lib/datasets/ds_utils.py
@@ -0,0 +1,49 @@
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+def unique_boxes(boxes, scale=1.0):
+  """Return indices of unique boxes."""
+  v = np.array([1, 1e3, 1e6, 1e9])
+  hashes = np.round(boxes * scale).dot(v)
+  _, index = np.unique(hashes, return_index=True)
+  return np.sort(index)
+
+
+def xywh_to_xyxy(boxes):
+  """Convert [x y w h] box format to [x1 y1 x2 y2] format."""
+  return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1))
+
+
+def xyxy_to_xywh(boxes):
+  """Convert [x1 y1 x2 y2] box format to [x y w h] format."""
+  return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1))
+
+
+def validate_boxes(boxes, width=0, height=0):
+  """Check that a set of boxes are valid."""
+  x1 = boxes[:, 0]
+  y1 = boxes[:, 1]
+  x2 = boxes[:, 2]
+  y2 = boxes[:, 3]
+  assert (x1 >= 0).all()
+  assert (y1 >= 0).all()
+  assert (x2 >= x1).all()
+  assert (y2 >= y1).all()
+  assert (x2 < width).all()
+  assert (y2 < height).all()
+
+
+def filter_small_boxes(boxes, min_size):
+  w = boxes[:, 2] - boxes[:, 0]
+  h = boxes[:, 3] - boxes[:, 1]
+  keep = np.where((w >= min_size) & (h > min_size))[0]
+  return keep
diff --git a/src/tools/voc_eval_lib/datasets/imdb.py b/src/tools/voc_eval_lib/datasets/imdb.py
new file mode 100644
index 0000000..86709f0
--- /dev/null
+++ b/src/tools/voc_eval_lib/datasets/imdb.py
@@ -0,0 +1,268 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick and Xinlei Chen
+# Modified by Xingyi Zhou
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path as osp
+import PIL
+# from utils.cython_bbox import bbox_overlaps
+import numpy as np
+import scipy.sparse
+from model.config import cfg
+
+def bbox_overlaps(box1, box2):
+  area1 = (box1[2] - box1[0] + 1) * (box1[3] - box1[1] + 1)
+  area2 = (box2[2] - box2[0] + 1) * (box2[3] - box2[1] + 1)
+  inter = max(min(box1[2], box2[2]) - max(box1[0], box2[0]) + 1, 0) * \
+          max(min(box1[3], box2[3]) - max(box1[1], box2[1]) + 1, 0)
+  iou = 1.0 * inter / (area1 + area2 - inter)
+  return iou
+ 
+class imdb(object):
+  """Image database."""
+
+  def __init__(self, name, classes=None):
+    self._name = name
+    self._num_classes = 0
+    if not classes:
+      self._classes = []
+    else:
+      self._classes = classes
+    self._image_index = []
+    self._obj_proposer = 'gt'
+    self._roidb = None
+    self._roidb_handler = self.default_roidb
+    # Use this dict for storing dataset specific config options
+    self.config = {}
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def num_classes(self):
+    return len(self._classes)
+
+  @property
+  def classes(self):
+    return self._classes
+
+  @property
+  def image_index(self):
+    return self._image_index
+
+  @property
+  def roidb_handler(self):
+    return self._roidb_handler
+
+  @roidb_handler.setter
+  def roidb_handler(self, val):
+    self._roidb_handler = val
+
+  def set_proposal_method(self, method):
+    method = eval('self.' + method + '_roidb')
+    self.roidb_handler = method
+
+  @property
+  def roidb(self):
+    # A roidb is a list of dictionaries, each with the following keys:
+    #   boxes
+    #   gt_overlaps
+    #   gt_classes
+    #   flipped
+    if self._roidb is not None:
+      return self._roidb
+    self._roidb = self.roidb_handler()
+    return self._roidb
+
+  @property
+  def cache_path(self):
+    cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache'))
+    if not os.path.exists(cache_path):
+      os.makedirs(cache_path)
+    return cache_path
+
+  @property
+  def num_images(self):
+    return len(self.image_index)
+
+  def image_path_at(self, i):
+    raise NotImplementedError
+
+  def default_roidb(self):
+    raise NotImplementedError
+
+  def evaluate_detections(self, all_boxes, output_dir=None):
+    """
+    all_boxes is a list of length number-of-classes.
+    Each list element is a list of length number-of-images.
+    Each of those list elements is either an empty list []
+    or a numpy array of detection.
+
+    all_boxes[class][image] = [] or np.array of shape #dets x 5
+    """
+    raise NotImplementedError
+
+  def _get_widths(self):
+    return [PIL.Image.open(self.image_path_at(i)).size[0]
+            for i in range(self.num_images)]
+
+  def append_flipped_images(self):
+    num_images = self.num_images
+    widths = self._get_widths()
+    for i in range(num_images):
+      boxes = self.roidb[i]['boxes'].copy()
+      oldx1 = boxes[:, 0].copy()
+      oldx2 = boxes[:, 2].copy()
+      boxes[:, 0] = widths[i] - oldx2 - 1
+      boxes[:, 2] = widths[i] - oldx1 - 1
+      assert (boxes[:, 2] >= boxes[:, 0]).all()
+      entry = {'boxes': boxes,
+               'gt_overlaps': self.roidb[i]['gt_overlaps'],
+               'gt_classes': self.roidb[i]['gt_classes'],
+               'flipped': True}
+      self.roidb.append(entry)
+    self._image_index = self._image_index * 2
+
+  def evaluate_recall(self, candidate_boxes=None, thresholds=None,
+                      area='all', limit=None):
+    """Evaluate detection proposal recall metrics.
+
+    Returns:
+        results: dictionary of results with keys
+            'ar': average recall
+            'recalls': vector recalls at each IoU overlap threshold
+            'thresholds': vector of IoU overlap thresholds
+            'gt_overlaps': vector of all ground-truth overlaps
+    """
+    # Record max overlap value for each gt box
+    # Return vector of overlap values
+    areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3,
+             '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
+    area_ranges = [[0 ** 2, 1e5 ** 2],  # all
+                   [0 ** 2, 32 ** 2],  # small
+                   [32 ** 2, 96 ** 2],  # medium
+                   [96 ** 2, 1e5 ** 2],  # large
+                   [96 ** 2, 128 ** 2],  # 96-128
+                   [128 ** 2, 256 ** 2],  # 128-256
+                   [256 ** 2, 512 ** 2],  # 256-512
+                   [512 ** 2, 1e5 ** 2],  # 512-inf
+                   ]
+    assert area in areas, 'unknown area range: {}'.format(area)
+    area_range = area_ranges[areas[area]]
+    gt_overlaps = np.zeros(0)
+    num_pos = 0
+    for i in range(self.num_images):
+      # Checking for max_overlaps == 1 avoids including crowd annotations
+      # (...pretty hacking :/)
+      max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1)
+      gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) &
+                         (max_gt_overlaps == 1))[0]
+      gt_boxes = self.roidb[i]['boxes'][gt_inds, :]
+      gt_areas = self.roidb[i]['seg_areas'][gt_inds]
+      valid_gt_inds = np.where((gt_areas >= area_range[0]) &
+                               (gt_areas <= area_range[1]))[0]
+      gt_boxes = gt_boxes[valid_gt_inds, :]
+      num_pos += len(valid_gt_inds)
+
+      if candidate_boxes is None:
+        # If candidate_boxes is not supplied, the default is to use the
+        # non-ground-truth boxes from this roidb
+        non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0]
+        boxes = self.roidb[i]['boxes'][non_gt_inds, :]
+      else:
+        boxes = candidate_boxes[i]
+      if boxes.shape[0] == 0:
+        continue
+      if limit is not None and boxes.shape[0] > limit:
+        boxes = boxes[:limit, :]
+
+      overlaps = bbox_overlaps(boxes.astype(np.float),
+                               gt_boxes.astype(np.float))
+
+      _gt_overlaps = np.zeros((gt_boxes.shape[0]))
+      for j in range(gt_boxes.shape[0]):
+        # find which proposal box maximally covers each gt box
+        argmax_overlaps = overlaps.argmax(axis=0)
+        # and get the iou amount of coverage for each gt box
+        max_overlaps = overlaps.max(axis=0)
+        # find which gt box is 'best' covered (i.e. 'best' = most iou)
+        gt_ind = max_overlaps.argmax()
+        gt_ovr = max_overlaps.max()
+        assert (gt_ovr >= 0)
+        # find the proposal box that covers the best covered gt box
+        box_ind = argmax_overlaps[gt_ind]
+        # record the iou coverage of this gt box
+        _gt_overlaps[j] = overlaps[box_ind, gt_ind]
+        assert (_gt_overlaps[j] == gt_ovr)
+        # mark the proposal box and the gt box as used
+        overlaps[box_ind, :] = -1
+        overlaps[:, gt_ind] = -1
+      # append recorded iou coverage level
+      gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps))
+
+    gt_overlaps = np.sort(gt_overlaps)
+    if thresholds is None:
+      step = 0.05
+      thresholds = np.arange(0.5, 0.95 + 1e-5, step)
+    recalls = np.zeros_like(thresholds)
+    # compute recall for each iou threshold
+    for i, t in enumerate(thresholds):
+      recalls[i] = (gt_overlaps >= t).sum() / float(num_pos)
+    # ar = 2 * np.trapz(recalls, thresholds)
+    ar = recalls.mean()
+    return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds,
+            'gt_overlaps': gt_overlaps}
+
+  def create_roidb_from_box_list(self, box_list, gt_roidb):
+    assert len(box_list) == self.num_images, \
+      'Number of boxes must match number of ground-truth images'
+    roidb = []
+    for i in range(self.num_images):
+      boxes = box_list[i]
+      num_boxes = boxes.shape[0]
+      overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)
+
+      if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
+        gt_boxes = gt_roidb[i]['boxes']
+        gt_classes = gt_roidb[i]['gt_classes']
+        gt_overlaps = bbox_overlaps(boxes.astype(np.float),
+                                    gt_boxes.astype(np.float))
+        argmaxes = gt_overlaps.argmax(axis=1)
+        maxes = gt_overlaps.max(axis=1)
+        I = np.where(maxes > 0)[0]
+        overlaps[I, gt_classes[argmaxes[I]]] = maxes[I]
+
+      overlaps = scipy.sparse.csr_matrix(overlaps)
+      roidb.append({
+        'boxes': boxes,
+        'gt_classes': np.zeros((num_boxes,), dtype=np.int32),
+        'gt_overlaps': overlaps,
+        'flipped': False,
+        'seg_areas': np.zeros((num_boxes,), dtype=np.float32),
+      })
+    return roidb
+
+  @staticmethod
+  def merge_roidbs(a, b):
+    assert len(a) == len(b)
+    for i in range(len(a)):
+      a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes']))
+      a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'],
+                                      b[i]['gt_classes']))
+      a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'],
+                                                 b[i]['gt_overlaps']])
+      a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'],
+                                     b[i]['seg_areas']))
+    return a
+
+  def competition_mode(self, on):
+    """Turn competition mode on or off."""
+    pass
diff --git a/src/tools/voc_eval_lib/datasets/pascal_voc.py b/src/tools/voc_eval_lib/datasets/pascal_voc.py
new file mode 100644
index 0000000..6852c06
--- /dev/null
+++ b/src/tools/voc_eval_lib/datasets/pascal_voc.py
@@ -0,0 +1,313 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick and Xinlei Chen
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+from datasets.imdb import imdb
+import datasets.ds_utils as ds_utils
+import xml.etree.ElementTree as ET
+import numpy as np
+import scipy.sparse
+import scipy.io as sio
+# import utils.cython_bbox
+import pickle
+import subprocess
+import uuid
+from .voc_eval import voc_eval
+from model.config import cfg
+
+
+class pascal_voc(imdb):
+  def __init__(self, image_set, year, use_diff=False):  # e.g. pascal_voc('test', '2007')
+    name = 'voc_' + year + '_' + image_set
+    if use_diff:
+      name += '_diff'
+    imdb.__init__(self, name)
+    self._year = year
+    self._image_set = image_set
+    self._devkit_path = self._get_default_path()
+    self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)
+    self._classes = ('__background__',  # always index 0
+                     'aeroplane', 'bicycle', 'bird', 'boat',
+                     'bottle', 'bus', 'car', 'cat', 'chair',
+                     'cow', 'diningtable', 'dog', 'horse',
+                     'motorbike', 'person', 'pottedplant',
+                     'sheep', 'sofa', 'train', 'tvmonitor')
+    self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes)))))  # class name -> index
+    self._image_ext = '.jpg'
+    self._image_index = self._load_image_set_index()  # reads the image set file, so _data_path must be set first
+    # Default to roidb handler
+    self._roidb_handler = self.gt_roidb
+    self._salt = str(uuid.uuid4())  # random suffix used by _get_comp_id when config['use_salt'] is set
+    self._comp_id = 'comp4'
+
+    # PASCAL specific config options
+    self.config = {'cleanup': True,
+                   'use_salt': True,
+                   'use_diff': use_diff,
+                   'matlab_eval': False,
+                   'rpn_file': None}
+
+    assert os.path.exists(self._devkit_path), \
+      'VOCdevkit path does not exist: {}'.format(self._devkit_path)
+    assert os.path.exists(self._data_path), \
+      'Path does not exist: {}'.format(self._data_path)
+
+  def image_path_at(self, i):
+    """
+    Return the absolute path to image i in the image sequence.
+    """
+    return self.image_path_from_index(self._image_index[i])
+
+  def image_path_from_index(self, index):
+    """
+    Construct an image path from the image's "index" identifier.
+    """
+    image_path = os.path.join(self._data_path, 'JPEGImages',
+                              index + self._image_ext)
+    assert os.path.exists(image_path), \
+      'Path does not exist: {}'.format(image_path)
+    return image_path
+
+  def _load_image_set_index(self):
+    """
+    Load the indexes listed in this dataset's image set file.
+    """
+    # Example path to image set file:
+    # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt
+    image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main',
+                                  self._image_set + '.txt')
+    assert os.path.exists(image_set_file), \
+      'Path does not exist: {}'.format(image_set_file)
+    with open(image_set_file) as f:
+      image_index = [x.strip() for x in f.readlines()]
+    return image_index
+
+  def _get_default_path(self):
+    """
+    Return the default path where PASCAL VOC is expected to be installed.
+    """
+    return os.path.join(cfg.DATA_DIR, 'voc', 'VOCdevkit')
+
+  def gt_roidb(self):
+    """
+    Return the database of ground-truth regions of interest.
+
+    This function loads/saves from/to a cache file to speed up future calls.
+    """
+    cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl')
+    if os.path.exists(cache_file):
+      with open(cache_file, 'rb') as fid:
+        try:
+          roidb = pickle.load(fid)
+        except:
+          roidb = pickle.load(fid, encoding='bytes')
+      print('{} gt roidb loaded from {}'.format(self.name, cache_file))
+      return roidb
+
+    gt_roidb = [self._load_pascal_annotation(index)
+                for index in self.image_index]
+    with open(cache_file, 'wb') as fid:
+      pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL)
+    print('wrote gt roidb to {}'.format(cache_file))
+
+    return gt_roidb
+
+  def rpn_roidb(self):
+    if int(self._year) == 2007 or self._image_set != 'test':
+      gt_roidb = self.gt_roidb()
+      rpn_roidb = self._load_rpn_roidb(gt_roidb)
+      roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb)
+    else:
+      roidb = self._load_rpn_roidb(None)
+
+    return roidb
+
+  def _load_rpn_roidb(self, gt_roidb):
+    filename = self.config['rpn_file']
+    print('loading {}'.format(filename))
+    assert os.path.exists(filename), \
+      'rpn data not found at: {}'.format(filename)
+    with open(filename, 'rb') as f:
+      box_list = pickle.load(f)
+    return self.create_roidb_from_box_list(box_list, gt_roidb)
+
+  def _load_pascal_annotation(self, index):
+    """
+    Load image and bounding boxes info from XML file in the PASCAL VOC
+    format.
+    """
+    filename = os.path.join(self._data_path, 'Annotations', index + '.xml')
+    tree = ET.parse(filename)
+    objs = tree.findall('object')
+    if not self.config['use_diff']:
+      # Exclude the samples labeled as difficult
+      non_diff_objs = [
+        obj for obj in objs if int(obj.find('difficult').text) == 0]
+      # if len(non_diff_objs) != len(objs):
+      #     print 'Removed {} difficult objects'.format(
+      #         len(objs) - len(non_diff_objs))
+      objs = non_diff_objs
+    num_objs = len(objs)
+
+    boxes = np.zeros((num_objs, 4), dtype=np.uint16)
+    gt_classes = np.zeros((num_objs), dtype=np.int32)
+    overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32)
+    # "Seg" area for pascal is just the box area
+    seg_areas = np.zeros((num_objs), dtype=np.float32)
+
+    # Load object bounding boxes into a data frame.
+    for ix, obj in enumerate(objs):
+      bbox = obj.find('bndbox')
+      # Make pixel indexes 0-based
+      x1 = float(bbox.find('xmin').text) - 1
+      y1 = float(bbox.find('ymin').text) - 1
+      x2 = float(bbox.find('xmax').text) - 1
+      y2 = float(bbox.find('ymax').text) - 1
+      cls = self._class_to_ind[obj.find('name').text.lower().strip()]
+      boxes[ix, :] = [x1, y1, x2, y2]
+      gt_classes[ix] = cls
+      overlaps[ix, cls] = 1.0
+      seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
+
+    overlaps = scipy.sparse.csr_matrix(overlaps)
+
+    return {'boxes': boxes,
+            'gt_classes': gt_classes,
+            'gt_overlaps': overlaps,
+            'flipped': False,
+            'seg_areas': seg_areas}
+
+  def _get_comp_id(self):
+    comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt']
+               else self._comp_id)
+    return comp_id
+
+  def _get_voc_results_file_template(self):
+    # VOCdevkit/results/VOC2007/Main/<comp_id>_det_test_aeroplane.txt
+    filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt'
+    path = os.path.join(
+      self._devkit_path,
+      'results',
+      'VOC' + self._year,
+      'Main',
+      filename)
+    return path
+
+  def _write_voc_results_file(self, all_boxes):
+    """Write one results file per non-background class, one line per detection.
+
+    Coordinates are shifted by +1 because the VOCdevkit expects 1-based indices.
+    """
+    row_fmt = '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'
+    for cls_ind, cls in enumerate(self.classes):
+      if cls == '__background__':
+        continue
+      filename = self._get_voc_results_file_template().format(cls)
+      with open(filename, 'wt') as f:
+        for im_ind, index in enumerate(self.image_index):
+          dets = np.array(all_boxes[cls_ind][im_ind])
+          if len(dets) == 0:
+            continue
+          for det in dets:
+            f.write(row_fmt.format(index, det[-1], det[0] + 1, det[1] + 1,
+                                   det[2] + 1, det[3] + 1))
+
+  def _do_python_eval(self, output_dir=None):
+    """Score the written results files with the Python VOC evaluation.
+
+    For every non-background class this calls voc_eval() on that class's
+    results file, prints the class AP and, when output_dir is given, pickles
+    {'rec', 'prec', 'ap'} to <output_dir>/<cls>_pr.pkl.  The mean of all class
+    APs is printed at the end.  Nothing is returned.
+
+    Note: this is the Python port of the evaluation; _do_matlab_eval() runs
+    the official MATLAB code instead.
+
+    Args:
+      output_dir: directory for the per-class PR pickles; it is created when
+        it does not exist yet.  None disables writing the pickles.
+    """
+    # Path templates and files handed to voc_eval()
+    year_dir = os.path.join(self._devkit_path, 'VOC' + self._year)
+    annopath = os.path.join(year_dir, 'Annotations', '{:s}.xml')
+    imagesetfile = os.path.join(
+      year_dir, 'ImageSets', 'Main', self._image_set + '.txt')
+    cachedir = os.path.join(self._devkit_path, 'annotations_cache')
+
+    # The PASCAL VOC metric changed in 2010
+    use_07_metric = int(self._year) < 2010
+    print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No'))
+
+    save_pr = output_dir is not None
+    if save_pr and not os.path.isdir(output_dir):
+      os.mkdir(output_dir)
+
+    aps = []
+    for cls in self._classes:
+      if cls == '__background__':
+        continue
+      results_file = self._get_voc_results_file_template().format(cls)
+      rec, prec, ap = voc_eval(
+        results_file, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
+        use_07_metric=use_07_metric, use_diff=self.config['use_diff'])
+      aps.append(ap)
+      print(('AP for {} = {:.4f}'.format(cls, ap)))
+      if save_pr:
+        pr_path = os.path.join(output_dir, cls + '_pr.pkl')
+        with open(pr_path, 'wb') as f:
+          pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f)
+
+    print(('Mean AP = {:.4f}'.format(np.mean(aps))))
+    print('~~~~~~~~')
+    
+  def _do_matlab_eval(self, output_dir='output'):
+    """Run the MATLAB VOC evaluation via a shell command (None -> 'output')."""
+    if output_dir is None:  # '{:s}'.format(None) below would raise TypeError
+      output_dir = 'output'
+    print('-----------------------------------------------------')
+    print('Computing results with the official MATLAB eval code.')
+    print('-----------------------------------------------------')
+    path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', 'VOCdevkit-matlab-wrapper')
+    cmd = 'cd {} && {:s} -nodisplay -nodesktop '.format(path, cfg.MATLAB)
+    cmd += '-r "dbstop if error; '
+    cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"'.format(
+      self._devkit_path, self._get_comp_id(), self._image_set, output_dir)
+    print(('Running:\n{}'.format(cmd)))
+    subprocess.call(cmd, shell=True)  # cmd is a shell string; paths are unquoted
+
+  def evaluate_detections(self, all_boxes, output_dir=None):
+    """Write the results files, evaluate them, then optionally remove them."""
+    self._write_voc_results_file(all_boxes)
+    self._do_python_eval(output_dir)
+    if self.config['matlab_eval']:
+      self._do_matlab_eval(output_dir)
+    if not self.config['cleanup']:
+      return
+    for cls in self._classes:
+      if cls != '__background__':
+        os.remove(self._get_voc_results_file_template().format(cls))
+
+  def competition_mode(self, on):
+    """Set config['use_salt'] and config['cleanup'] to False when on is truthy,
+    and to True otherwise."""
+    # with both off, _get_comp_id() omits the salt and results files are kept
+    flag = not on
+    self.config['use_salt'] = flag
+    self.config['cleanup'] = flag
+
+
+if __name__ == '__main__':
+  # Manual smoke test: build the VOC2007 trainval dataset, read its roidb and
+  # open an IPython shell so that `d` and `res` can be inspected.
+  from datasets.pascal_voc import pascal_voc
+  d = pascal_voc('trainval', '2007')
+  res = d.roidb
+  from IPython import embed
+  embed()
diff --git a/src/tools/voc_eval_lib/datasets/voc_eval.py b/src/tools/voc_eval_lib/datasets/voc_eval.py
new file mode 100644
index 0000000..5a49b43
--- /dev/null
+++ b/src/tools/voc_eval_lib/datasets/voc_eval.py
@@ -0,0 +1,215 @@
+# --------------------------------------------------------
+# Fast/er R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Bharath Hariharan
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import xml.etree.ElementTree as ET
+import os
+import pickle
+import numpy as np
+
+def parse_rec(filename):
+  """Parse a PASCAL VOC xml annotation file.
+
+  Returns one dict per <object> element with the keys 'name', 'pose',
+  'truncated', 'difficult' and 'bbox' ([xmin, ymin, xmax, ymax] as ints).
+  """
+  objects = []
+  for obj in ET.parse(filename).findall('object'):
+    box = obj.find('bndbox')
+    objects.append({
+      'name': obj.find('name').text,
+      'pose': obj.find('pose').text,
+      'truncated': int(obj.find('truncated').text),
+      'difficult': int(obj.find('difficult').text),
+      'bbox': [int(box.find(tag).text)
+               for tag in ('xmin', 'ymin', 'xmax', 'ymax')],
+    })
+  return objects
+
+
+def voc_ap(rec, prec, use_07_metric=False):
+  """Compute VOC average precision from a recall/precision curve.
+
+  Args:
+    rec: array of recall values.
+    prec: array of precision values, aligned with rec.
+    use_07_metric: if true, use the VOC07 11-point method (default: False).
+
+  Returns:
+    The average precision.
+  """
+  if use_07_metric:
+    # 11 point metric: mean of the best precision at recall >= 0, 0.1, ..., 1
+    ap = 0.
+    for t in np.arange(0., 1.1, 0.1):
+      reached = rec >= t
+      p = np.max(prec[reached]) if np.sum(reached) != 0 else 0
+      ap = ap + p / 11.
+    return ap
+
+  # exact area under the curve; pad both ends with sentinel values first
+  mrec = np.concatenate(([0.], rec, [1.]))
+  mpre = np.concatenate(([0.], prec, [0.]))
+
+  # precision envelope: make precision non-increasing from left to right
+  for i in reversed(range(1, mpre.size)):
+    mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
+
+  # indices where the recall value changes
+  steps = np.where(mrec[1:] != mrec[:-1])[0]
+
+  # sum (\Delta recall) * prec
+  return np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])
+
+
+def voc_eval(detpath,
+             annopath,
+             imagesetfile,
+             classname,
+             cachedir,
+             ovthresh=0.5,
+             use_07_metric=False,
+             use_diff=False):
+  """Top level function that does the PASCAL VOC evaluation for one class.
+
+  detpath: Path to detections
+      detpath.format(classname) should produce the detection results file.
+  annopath: Path to annotations
+      annopath.format(imagename) should be the xml annotations file.
+  imagesetfile: Text file containing the list of images, one image per line.
+  classname: Category name
+  cachedir: Directory for caching the annotations; the parsed annotations
+      are pickled to <cachedir>/<imagesetfile base name>_annots.pkl.
+  [ovthresh]: Overlap threshold (default = 0.5)
+  [use_07_metric]: Whether to use VOC07's 11 point AP computation
+      (default False)
+  [use_diff]: Whether ground truth flagged 'difficult' is treated like any
+      other ground truth instead of being ignored (default False)
+
+  Returns (rec, prec, ap): cumulative recall and precision arrays over the
+  detections sorted by descending confidence, and the average precision.
+  """
+  # first load gt
+  if not os.path.isdir(cachedir):
+    os.mkdir(cachedir)
+  # Use only the base name of imagesetfile: os.path.join() discards cachedir
+  # altogether when a later component is an absolute path.
+  cachefile = os.path.join(
+    cachedir, '%s_annots.pkl' % os.path.basename(imagesetfile))
+  # read list of images
+  with open(imagesetfile, 'r') as f:
+    imagenames = [x.strip() for x in f.readlines()]
+
+  if not os.path.isfile(cachefile):
+    # load annotations
+    recs = {}
+    for i, imagename in enumerate(imagenames):
+      recs[imagename] = parse_rec(annopath.format(imagename))
+      if i % 100 == 0:
+        print('Reading annotation for {:d}/{:d}'.format(
+          i + 1, len(imagenames)))
+    # save
+    print('Saving cached annotations to {:s}'.format(cachefile))
+    with open(cachefile, 'wb') as f:
+      pickle.dump(recs, f)
+  else:
+    # load
+    with open(cachefile, 'rb') as f:
+      try:
+        recs = pickle.load(f)
+      except UnicodeDecodeError:
+        # Cache pickled by Python 2.  Rewind first: the failed attempt has
+        # already consumed part of the stream.
+        f.seek(0)
+        recs = pickle.load(f, encoding='bytes')
+
+  # extract gt objects for this class
+  class_recs = {}
+  npos = 0
+  for imagename in imagenames:
+    R = [obj for obj in recs[imagename] if obj['name'] == classname]
+    bbox = np.array([x['bbox'] for x in R])
+    # builtin bool below: the np.bool alias was removed in NumPy 1.24
+    if use_diff:
+      difficult = np.zeros(len(R), dtype=bool)
+    else:
+      difficult = np.array([x['difficult'] for x in R]).astype(bool)
+    det = [False] * len(R)
+    npos = npos + sum(~difficult)
+    class_recs[imagename] = {'bbox': bbox,
+                             'difficult': difficult,
+                             'det': det}
+
+  # read dets
+  detfile = detpath.format(classname)
+  with open(detfile, 'r') as f:
+    lines = f.readlines()
+
+  splitlines = [x.strip().split(' ') for x in lines]
+  image_ids = [x[0] for x in splitlines]
+  confidence = np.array([float(x[1]) for x in splitlines])
+  BB = np.array([[float(z) for z in x[2:]] for x in splitlines])
+
+  nd = len(image_ids)
+  tp = np.zeros(nd)
+  fp = np.zeros(nd)
+
+  if BB.shape[0] > 0:
+    # sort by confidence
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    image_ids = [image_ids[x] for x in sorted_ind]
+
+    # go down dets and mark TPs and FPs
+    for d in range(nd):
+      R = class_recs[image_ids[d]]
+      bb = BB[d, :].astype(float)
+      ovmax = -np.inf
+      BBGT = R['bbox'].astype(float)
+
+      if BBGT.size > 0:
+        # compute overlaps
+        # intersection
+        ixmin = np.maximum(BBGT[:, 0], bb[0])
+        iymin = np.maximum(BBGT[:, 1], bb[1])
+        ixmax = np.minimum(BBGT[:, 2], bb[2])
+        iymax = np.minimum(BBGT[:, 3], bb[3])
+        iw = np.maximum(ixmax - ixmin + 1., 0.)
+        ih = np.maximum(iymax - iymin + 1., 0.)
+        inters = iw * ih
+
+        # union
+        uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) +
+               (BBGT[:, 2] - BBGT[:, 0] + 1.) *
+               (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
+
+        overlaps = inters / uni
+        ovmax = np.max(overlaps)
+        jmax = np.argmax(overlaps)
+
+      if ovmax > ovthresh:
+        # a match with a 'difficult' box counts as neither TP nor FP
+        if not R['difficult'][jmax]:
+          if not R['det'][jmax]:
+            tp[d] = 1.
+            R['det'][jmax] = 1
+          else:
+            # this ground truth was already claimed by an earlier detection
+            fp[d] = 1.
+      else:
+        fp[d] = 1.
+
+  # compute precision recall
+  fp = np.cumsum(fp)
+  tp = np.cumsum(tp)
+  rec = tp / float(npos)
+  # avoid divide by zero in case the first detection matches a difficult
+  # ground truth
+  prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+  ap = voc_ap(rec, prec, use_07_metric)
+  return rec, prec, ap
diff --git a/src/tools/voc_eval_lib/model/__init__.py b/src/tools/voc_eval_lib/model/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tools/voc_eval_lib/model/bbox_transform.py b/src/tools/voc_eval_lib/model/bbox_transform.py
new file mode 100644
index 0000000..5e617f2
--- /dev/null
+++ b/src/tools/voc_eval_lib/model/bbox_transform.py
@@ -0,0 +1,85 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+def bbox_transform(ex_rois, gt_rois):
+  """Compute regression targets (dx, dy, dw, dh) mapping ex_rois onto gt_rois.
+
+  Both inputs are indexed as [:, 0..3] = (x1, y1, x2, y2); the result has one
+  row of four targets per roi.
+  """
+  def size_and_center(rois):
+    w = rois[:, 2] - rois[:, 0] + 1.0
+    h = rois[:, 3] - rois[:, 1] + 1.0
+    return w, h, rois[:, 0] + 0.5 * w, rois[:, 1] + 0.5 * h
+
+  ex_w, ex_h, ex_cx, ex_cy = size_and_center(ex_rois)
+  gt_w, gt_h, gt_cx, gt_cy = size_and_center(gt_rois)
+
+  return np.vstack((
+    (gt_cx - ex_cx) / ex_w,
+    (gt_cy - ex_cy) / ex_h,
+    np.log(gt_w / ex_w),
+    np.log(gt_h / ex_h))).transpose()
+
+
+def bbox_transform_inv(boxes, deltas):
+  """Apply regression deltas to boxes and return the predicted boxes.
+
+  Args:
+    boxes: array indexed as [:, 0..3] = (x1, y1, x2, y2).
+    deltas: array whose columns come in groups of four (dx, dy, dw, dh).
+
+  Returns:
+    Array with the shape and dtype of deltas holding (x1, y1, x2, y2) per
+    group; shape (0, deltas.shape[1]) when boxes is empty.
+  """
+  if boxes.shape[0] == 0:
+    return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
+
+  boxes = boxes.astype(deltas.dtype, copy=False)
+  # column vectors, so they broadcast against the per-group delta columns
+  widths = (boxes[:, 2] - boxes[:, 0] + 1.0)[:, np.newaxis]
+  heights = (boxes[:, 3] - boxes[:, 1] + 1.0)[:, np.newaxis]
+  ctr_x = boxes[:, 0][:, np.newaxis] + 0.5 * widths
+  ctr_y = boxes[:, 1][:, np.newaxis] + 0.5 * heights
+
+  pred_ctr_x = deltas[:, 0::4] * widths + ctr_x
+  pred_ctr_y = deltas[:, 1::4] * heights + ctr_y
+  half_w = 0.5 * (np.exp(deltas[:, 2::4]) * widths)
+  half_h = 0.5 * (np.exp(deltas[:, 3::4]) * heights)
+  pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
+  pred_boxes[:, 0::4] = pred_ctr_x - half_w  # x1
+  pred_boxes[:, 1::4] = pred_ctr_y - half_h  # y1
+  pred_boxes[:, 2::4] = pred_ctr_x + half_w  # x2
+  pred_boxes[:, 3::4] = pred_ctr_y + half_h  # y2
+  return pred_boxes
+
+
+def clip_boxes(boxes, im_shape):
+  """Clip boxes in place to the image boundaries and return them.
+
+  x coordinates (columns 0 and 2 of every group of four) are limited to
+  [0, im_shape[1] - 1], y coordinates (columns 1 and 3) to
+  [0, im_shape[0] - 1].
+  """
+  max_x = im_shape[1] - 1
+  max_y = im_shape[0] - 1
+  for offset, upper in enumerate((max_x, max_y, max_x, max_y)):
+    cols = boxes[:, offset::4]
+    boxes[:, offset::4] = np.maximum(np.minimum(cols, upper), 0)
+
+  return boxes
+
+
+
+
+
diff --git a/src/tools/voc_eval_lib/model/config.py b/src/tools/voc_eval_lib/model/config.py
new file mode 100644
index 0000000..4dfd99c
--- /dev/null
+++ b/src/tools/voc_eval_lib/model/config.py
@@ -0,0 +1,387 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import os.path as osp
+import numpy as np
+# `pip install easydict` if you don't have it
+from easydict import EasyDict as edict
+
+__C = edict()
+# Consumers can get config by:
+#   from model.config import cfg
+cfg = __C
+
+#
+# Training options
+#
+__C.TRAIN = edict()
+
+# Initial learning rate
+__C.TRAIN.LEARNING_RATE = 0.001
+
+# Momentum
+__C.TRAIN.MOMENTUM = 0.9
+
+# Weight decay, for regularization
+__C.TRAIN.WEIGHT_DECAY = 0.0001
+
+# Factor for reducing the learning rate
+__C.TRAIN.GAMMA = 0.1
+
+# Step size for reducing the learning rate, currently only supports one step
+__C.TRAIN.STEPSIZE = [30000]
+
+# Iteration intervals for showing the loss during training, on command line interface
+__C.TRAIN.DISPLAY = 10
+
+# Whether to double the learning rate for bias
+__C.TRAIN.DOUBLE_BIAS = True
+
+# Whether to initialize the weights with truncated normal distribution 
+__C.TRAIN.TRUNCATED = False
+
+# Whether to have weight decay on bias as well
+__C.TRAIN.BIAS_DECAY = False
+
+# Whether to add ground truth boxes to the pool when sampling regions
+__C.TRAIN.USE_GT = False
+
+# Whether to use aspect-ratio grouping of training images, introduced merely for saving
+# GPU memory
+__C.TRAIN.ASPECT_GROUPING = False
+
+# The number of snapshots kept, older ones are deleted to save space
+__C.TRAIN.SNAPSHOT_KEPT = 3
+
+# The time interval for saving tensorflow summaries
+__C.TRAIN.SUMMARY_INTERVAL = 180
+
+# Scale to use during training (can list multiple scales)
+# The scale is the pixel size of an image's shortest side
+__C.TRAIN.SCALES = (600,)
+
+# Max pixel size of the longest side of a scaled input image
+__C.TRAIN.MAX_SIZE = 1000
+
+# Images to use per minibatch
+__C.TRAIN.IMS_PER_BATCH = 1
+
+# Minibatch size (number of regions of interest [ROIs])
+__C.TRAIN.BATCH_SIZE = 128
+
+# Fraction of minibatch that is labeled foreground (i.e. class > 0)
+__C.TRAIN.FG_FRACTION = 0.25
+
+# Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH)
+__C.TRAIN.FG_THRESH = 0.5
+
+# Overlap threshold for a ROI to be considered background (class = 0 if
+# overlap in [LO, HI))
+__C.TRAIN.BG_THRESH_HI = 0.5
+__C.TRAIN.BG_THRESH_LO = 0.1
+
+# Use horizontally-flipped images during training?
+__C.TRAIN.USE_FLIPPED = True
+
+# Train bounding-box regressors
+__C.TRAIN.BBOX_REG = True
+
+# Overlap required between a ROI and ground-truth box in order for that ROI to
+# be used as a bounding-box regression training example
+__C.TRAIN.BBOX_THRESH = 0.5
+
+# Iterations between snapshots
+__C.TRAIN.SNAPSHOT_ITERS = 5000
+
+# solver.prototxt specifies the snapshot path prefix, this adds an optional
+# infix to yield the path: <prefix>[_<infix>]_iters_XYZ.caffemodel
+__C.TRAIN.SNAPSHOT_PREFIX = 'res101_faster_rcnn'
+
+# Normalize the targets (subtract empirical mean, divide by empirical stddev)
+__C.TRAIN.BBOX_NORMALIZE_TARGETS = True
+
+# Deprecated (inside weights)
+__C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+
+# Normalize the targets using "precomputed" (or made up) means and stdevs
+# (BBOX_NORMALIZE_TARGETS must also be True)
+__C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = True
+
+__C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0)
+
+__C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2)
+
+# Train using these proposals
+__C.TRAIN.PROPOSAL_METHOD = 'gt'
+
+# Make minibatches from images that have similar aspect ratios (i.e. both
+# tall and thin or both short and wide) in order to avoid wasting computation
+# on zero-padding.
+
+# Use RPN to detect objects
+__C.TRAIN.HAS_RPN = True
+
+# IOU >= thresh: positive example
+__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
+
+# IOU < thresh: negative example
+__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
+
+# Whether an anchor that satisfies both the positive and the negative condition is set to negative
+__C.TRAIN.RPN_CLOBBER_POSITIVES = False
+
+# Max number of foreground examples
+__C.TRAIN.RPN_FG_FRACTION = 0.5
+
+# Total number of examples
+__C.TRAIN.RPN_BATCHSIZE = 256
+
+# NMS threshold used on RPN proposals
+__C.TRAIN.RPN_NMS_THRESH = 0.7
+
+# Number of top scoring boxes to keep before applying NMS to RPN proposals
+__C.TRAIN.RPN_PRE_NMS_TOP_N = 12000
+
+# Number of top scoring boxes to keep after applying NMS to RPN proposals
+__C.TRAIN.RPN_POST_NMS_TOP_N = 2000
+
+# Deprecated (inside weights)
+__C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+
+# Give the positive RPN examples weight of p * 1 / {num positives}
+# and give negatives a weight of (1 - p)
+# Set to -1.0 to use uniform example weighting
+__C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
+
+# Whether to use all ground truth bounding boxes for training, 
+# For COCO, setting USE_ALL_GT to False will exclude boxes that are flagged as ''iscrowd''
+__C.TRAIN.USE_ALL_GT = True
+
+#
+# Testing options
+#
+__C.TEST = edict()
+
+# Scale to use during testing (can NOT list multiple scales)
+# The scale is the pixel size of an image's shortest side
+__C.TEST.SCALES = (600,)
+
+# Max pixel size of the longest side of a scaled input image
+__C.TEST.MAX_SIZE = 1000
+
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+__C.TEST.NMS = 0.3
+
+# Experimental: treat the (K+1) units in the cls_score layer as linear
+# predictors (trained, eg, with one-vs-rest SVMs).
+__C.TEST.SVM = False
+
+# Test using bounding-box regressors
+__C.TEST.BBOX_REG = True
+
+# Propose boxes
+__C.TEST.HAS_RPN = False
+
+# Test using these proposals
+__C.TEST.PROPOSAL_METHOD = 'gt'
+
+# NMS threshold used on RPN proposals
+__C.TEST.RPN_NMS_THRESH = 0.7
+
+# Number of top scoring boxes to keep before applying NMS to RPN proposals
+__C.TEST.RPN_PRE_NMS_TOP_N = 6000
+
+# Number of top scoring boxes to keep after applying NMS to RPN proposals
+__C.TEST.RPN_POST_NMS_TOP_N = 300
+
+# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
+# __C.TEST.RPN_MIN_SIZE = 16
+
+# Testing mode, default to be 'nms', 'top' is slower but better
+# See report for details
+__C.TEST.MODE = 'nms'
+
+# Only useful when TEST.MODE is 'top', specifies the number of top proposals to select
+__C.TEST.RPN_TOP_N = 5000
+
+#
+# ResNet options
+#
+
+__C.RESNET = edict()
+
+# Option to set if max-pooling is appended after crop_and_resize. 
+# if true, the region will be resized to a square of 2xPOOLING_SIZE, 
+# then 2x2 max-pooling is applied; otherwise the region will be directly
+# resized to a square of POOLING_SIZE
+__C.RESNET.MAX_POOL = False
+
+# Number of fixed blocks during training, by default the first of all 4 blocks is fixed
+# Range: 0 (none) to 3 (all)
+__C.RESNET.FIXED_BLOCKS = 1
+
+#
+# MobileNet options
+#
+
+__C.MOBILENET = edict()
+
+# Whether to regularize the depth-wise filters during training
+__C.MOBILENET.REGU_DEPTH = False
+
+# Number of fixed layers during training, by default the bottom 5 of 14 layers is fixed
+# Range: 0 (none) to 12 (all)
+__C.MOBILENET.FIXED_LAYERS = 5
+
+# Weight decay for the mobilenet weights
+__C.MOBILENET.WEIGHT_DECAY = 0.00004
+
+# Depth multiplier
+__C.MOBILENET.DEPTH_MULTIPLIER = 1.
+
+#
+# MISC
+#
+
+# Pixel mean values (BGR order) as a (1, 1, 3) array
+# We use the same pixel mean for all networks even though it's not exactly what
+# they were trained with
+__C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]])
+
+# For reproducibility
+__C.RNG_SEED = 3
+
+# Root directory of project
+__C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..', '..', '..'))
+
+# Data directory
+__C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data'))
+
+# Name (or path to) the matlab executable
+__C.MATLAB = 'matlab'
+
+# Place outputs under an experiments directory
+__C.EXP_DIR = 'default'
+
+# Use GPU implementation of non-maximum suppression
+__C.USE_GPU_NMS = True
+
+# Use an end-to-end tensorflow model.
+# Note: models in E2E tensorflow mode have only been tested in feed-forward mode,
+#       but these models are exportable to other tensorflow instances as GraphDef files.
+__C.USE_E2E_TF = True
+
+# Default pooling mode, only 'crop' is available
+__C.POOLING_MODE = 'crop'
+
+# Size of the pooled region after RoI pooling
+__C.POOLING_SIZE = 7
+
+# Anchor scales for RPN
+__C.ANCHOR_SCALES = [8,16,32]
+
+# Anchor ratios for RPN
+__C.ANCHOR_RATIOS = [0.5,1,2]
+
+# Number of filters for the RPN layer
+__C.RPN_CHANNELS = 512
+
+
+def get_output_dir(imdb, weights_filename):
+  """Return the directory where experimental artifacts are placed.
+
+  The path is <ROOT_DIR>/output/<EXP_DIR>/<imdb.name>/<weights_filename>,
+  with 'default' standing in for a weights_filename of None.  The directory
+  is created if it does not exist.
+  """
+  leaf = 'default' if weights_filename is None else weights_filename
+  base = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name))
+  outdir = osp.join(base, leaf)
+
+  if not os.path.exists(outdir):
+    os.makedirs(outdir)
+  return outdir
+
+
+def get_output_tb_dir(imdb, weights_filename):
+  """Return the directory where tensorflow summaries are placed.
+
+  The path is <ROOT_DIR>/tensorboard/<EXP_DIR>/<imdb.name>/<weights_filename>,
+  with 'default' standing in for a weights_filename of None.  The directory
+  is created if it does not exist.
+  """
+  leaf = 'default' if weights_filename is None else weights_filename
+  base = osp.abspath(osp.join(__C.ROOT_DIR, 'tensorboard', __C.EXP_DIR, imdb.name))
+  outdir = osp.join(base, leaf)
+
+  if not os.path.exists(outdir):
+    os.makedirs(outdir)
+  return outdir
+
+
+def _merge_a_into_b(a, b):
+  """Merge config dictionary a into config dictionary b, clobbering the
+  options in b whenever they are also specified in a.
+
+  Raises KeyError for keys missing from b, ValueError on a type mismatch.
+  """
+  if type(a) is not edict:  # only exact edict instances are merged
+    return
+
+  for k, v in a.items():
+    # a must specify keys that are in b
+    if k not in b:
+      raise KeyError('{} is not a valid config key'.format(k))
+
+    # the types must match, too; only ndarray options accept a conversion
+    current = b[k]
+    if type(current) is not type(v):
+      if not isinstance(current, np.ndarray):
+        raise ValueError(('Type mismatch ({} vs. {}) '
+                          'for config key: {}').format(type(current),
+                                                       type(v), k))
+      v = np.array(v, dtype=current.dtype)
+    if type(v) is not edict:
+      b[k] = v
+      continue
+    # recursively merge dicts
+    try:
+      _merge_a_into_b(a[k], current)
+    except:
+      print(('Error under config key: {}'.format(k)))
+      raise
+
+
+def cfg_from_file(filename):
+  """Load a YAML config file and merge it into the default options."""
+  import yaml
+  with open(filename, 'r') as f:
+    # safe_load: plain yaml.load(f) needs a Loader on PyYAML >= 6 and is unsafe
+    yaml_cfg = edict(yaml.safe_load(f))
+  _merge_a_into_b(yaml_cfg, __C)
+
+
+def cfg_from_list(cfg_list):
+  """Set config keys from a flat [key, value, key, value, ...] list (e.g.,
+  from the command line); nested options use dotted keys like 'TEST.NMS'."""
+  from ast import literal_eval
+  assert len(cfg_list) % 2 == 0
+  for k, v in zip(cfg_list[0::2], cfg_list[1::2]):
+    key_list = k.split('.')
+    d = __C
+    for parent in key_list[:-1]:
+      assert parent in d
+      d = d[parent]
+    subkey = key_list[-1]
+    assert subkey in d
+    try:
+      value = literal_eval(v)
+    except:  # v is a plain string rather than a Python literal
+      value = v
+    expected = type(d[subkey])
+    assert type(value) == expected, \
+      'type {} does not match original type {}'.format(type(value), expected)
+    d[subkey] = value
diff --git a/src/tools/voc_eval_lib/model/nms_wrapper.py b/src/tools/voc_eval_lib/model/nms_wrapper.py
new file mode 100644
index 0000000..f594922
--- /dev/null
+++ b/src/tools/voc_eval_lib/model/nms_wrapper.py
@@ -0,0 +1,23 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from model.config import cfg
+from nms.gpu_nms import gpu_nms
+from nms.cpu_nms import cpu_nms
+
+def nms(dets, thresh, force_cpu=False):
+  """Dispatch to either CPU or GPU NMS implementations.
+
+  GPU (device 0) is used only if cfg.USE_GPU_NMS is set and force_cpu is false.
+  """
+  if dets.shape[0] == 0:
+    return []
+  use_gpu = cfg.USE_GPU_NMS and not force_cpu
+  return gpu_nms(dets, thresh, device_id=0) if use_gpu else cpu_nms(dets, thresh)
diff --git a/src/tools/voc_eval_lib/model/test.py b/src/tools/voc_eval_lib/model/test.py
new file mode 100644
index 0000000..4d4bf15
--- /dev/null
+++ b/src/tools/voc_eval_lib/model/test.py
@@ -0,0 +1,193 @@
+# --------------------------------------------------------
+# Tensorflow Faster R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xinlei Chen
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import cv2
+import numpy as np
+try:
+  import cPickle as pickle
+except ImportError:
+  import pickle
+import os
+import math
+
+from utils.timer import Timer
+from utils.blob import im_list_to_blob
+
+from model.config import cfg, get_output_dir
+from model.bbox_transform import clip_boxes, bbox_transform_inv
+# from model.nms_wrapper import nms  # need to compile cython nms before import nms
+nms = None  # not needed in pascal evaluation
+
+def _get_image_blob(im):
+  """Converts an image into a network input.
+  Arguments:
+    im (ndarray): a color image in BGR order
+  Returns:
+    blob (ndarray): a data blob holding an image pyramid
+    im_scale_factors (ndarray): image scales (relative to im) used
+      in the image pyramid, one per entry of cfg.TEST.SCALES
+  """
+  im_orig = im.astype(np.float32, copy=True)
+  im_orig -= cfg.PIXEL_MEANS
+
+  im_size_min = np.min(im_orig.shape[0:2])
+  im_size_max = np.max(im_orig.shape[0:2])
+
+  def scale_for(target_size):
+    # shortest side -> target_size, unless that would push the longest side
+    # beyond MAX_SIZE; in that case longest side -> MAX_SIZE
+    scale = float(target_size) / float(im_size_min)
+    if np.round(scale * im_size_max) > cfg.TEST.MAX_SIZE:
+      scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
+    return scale
+
+  im_scale_factors = [scale_for(size) for size in cfg.TEST.SCALES]
+  processed_ims = [
+    cv2.resize(im_orig, None, None, fx=scale, fy=scale,
+               interpolation=cv2.INTER_LINEAR)
+    for scale in im_scale_factors]
+
+  # Create a blob to hold the input images
+  blob = im_list_to_blob(processed_ims)
+
+  return blob, np.array(im_scale_factors)
+
+def _get_blobs(im):
+  """Return ({'data': image blob of im}, im_scale_factors); both values come
+  from _get_image_blob()."""
+  data, im_scale_factors = _get_image_blob(im)
+  blobs = {'data': data}
+  return blobs, im_scale_factors
+
+def _clip_boxes(boxes, im_shape):
+  """Clip boxes in place to the image boundaries and return them.
+
+  Per group of four columns: x1, y1 are raised to at least 0; x2, y2 are
+  lowered to at most im_shape[1] - 1 and im_shape[0] - 1 respectively.
+  """
+  for lo in (0, 1):
+    boxes[:, lo::4] = np.maximum(boxes[:, lo::4], 0)
+  for hi, limit in ((2, im_shape[1] - 1), (3, im_shape[0] - 1)):
+    boxes[:, hi::4] = np.minimum(boxes[:, hi::4], limit)
+  return boxes
+
+def _rescale_boxes(boxes, inds, scales):
+  """Undo the image rescaling: row i of boxes is divided by
+  scales[int(inds[i])].  boxes is modified in place and returned."""
+  for i, row in enumerate(boxes):
+    boxes[i, :] = row / scales[int(inds[i])]
+  return boxes
+
+def im_detect(sess, net, im):
+  """Run net on one image and return (scores, pred_boxes).
+
+  pred_boxes are the regressed, clipped boxes when cfg.TEST.BBOX_REG is set,
+  otherwise the rois (divided by the image scale) repeated once per class.
+  """
+  blobs, im_scales = _get_blobs(im)
+  assert len(im_scales) == 1, "Only single-image batch implemented"
+
+  im_blob = blobs['data']
+  blobs['im_info'] = np.array(
+    [im_blob.shape[1], im_blob.shape[2], im_scales[0]], dtype=np.float32)
+  _, scores, bbox_pred, rois = net.test_image(sess, im_blob, blobs['im_info'])
+  boxes = rois[:, 1:5] / im_scales[0]
+  scores = np.reshape(scores, [scores.shape[0], -1])
+  box_deltas = np.reshape(bbox_pred, [bbox_pred.shape[0], -1])
+  if not cfg.TEST.BBOX_REG:
+    # Simply repeat the boxes, once for each class
+    return scores, np.tile(boxes, (1, scores.shape[1]))
+  # Apply bounding-box regression deltas
+  pred_boxes = bbox_transform_inv(boxes, box_deltas)
+  return scores, _clip_boxes(pred_boxes, im.shape)
+
+def apply_nms(all_boxes, thresh):
+  """Apply non-maximum suppression to all predicted boxes output by the
+  test_net method.
+
+  all_boxes[cls][image] holds rows of (x1, y1, x2, y2, score).  Boxes without
+  positive width and height are dropped before NMS.  Returns a list with the
+  same nesting; entries without surviving boxes are empty lists.
+  """
+  num_classes = len(all_boxes)
+  num_images = len(all_boxes[0])
+  nms_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)]
+  for cls_ind in range(num_classes):
+    for im_ind in range(num_images):
+      dets = np.array(all_boxes[cls_ind][im_ind], dtype=np.float32)
+      if len(dets) == 0:
+        continue
+      # keep only boxes with x2 > x1 and y2 > y1
+      valid = (dets[:, 2] > dets[:, 0]) & (dets[:, 3] > dets[:, 1])
+      dets = dets[np.where(valid)[0], :]
+      # Test the row count: `dets == []` compares elementwise and never
+      # detects an empty array (recent NumPy raises on the shape mismatch).
+      if dets.shape[0] == 0:
+        continue
+      keep = nms(dets, thresh)
+      if len(keep) == 0:
+        continue
+      nms_boxes[cls_ind][im_ind] = dets[keep, :].copy()
+  return nms_boxes
+
+def test_net(sess, net, imdb, weights_filename, max_per_image=100, thresh=0.):
+  """Test a Fast R-CNN network on an image database.
+
+  Detects on every image of imdb, applies per-class NMS, keeps at most
+  max_per_image detections per image (when > 0), pickles all detections to
+  <output_dir>/detections.pkl and calls imdb.evaluate_detections().
+  """
+  np.random.seed(cfg.RNG_SEED)
+  num_images = len(imdb.image_index)
+  # all detections are collected into:
+  #  all_boxes[cls][image] = N x 5 array of detections in
+  #  (x1, y1, x2, y2, score)
+  all_boxes = [[[] for _ in range(num_images)]
+         for _ in range(imdb.num_classes)]
+
+  output_dir = get_output_dir(imdb, weights_filename)
+  _t = {'im_detect' : Timer(), 'misc' : Timer()}
+
+  for i in range(num_images):
+    im = cv2.imread(imdb.image_path_at(i))
+    _t['im_detect'].tic()
+    scores, boxes = im_detect(sess, net, im)
+    _t['im_detect'].toc()
+    _t['misc'].tic()
+    # skip j = 0, because it's the background class
+    for j in range(1, imdb.num_classes):
+      inds = np.where(scores[:, j] > thresh)[0]
+      cls_scores = scores[inds, j]
+      cls_boxes = boxes[inds, j*4:(j+1)*4]
+      cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
+        .astype(np.float32, copy=False)
+      # NOTE: needs a callable module-level nms; this file binds nms = None
+      keep = nms(cls_dets, cfg.TEST.NMS)
+      cls_dets = cls_dets[keep, :]
+      all_boxes[j][i] = cls_dets
+
+    # Limit to max_per_image detections *over all classes*
+    if max_per_image > 0:
+      image_scores = np.hstack([all_boxes[j][i][:, -1]
+                    for j in range(1, imdb.num_classes)])
+      if len(image_scores) > max_per_image:
+        image_thresh = np.sort(image_scores)[-max_per_image]
+        for j in range(1, imdb.num_classes):
+          keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
+          all_boxes[j][i] = all_boxes[j][i][keep, :]
+    _t['misc'].toc()
+
+    print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s'.format(
+      i + 1, num_images, _t['im_detect'].average_time, _t['misc'].average_time))
+
+  det_file = os.path.join(output_dir, 'detections.pkl')
+  with open(det_file, 'wb') as f:
+    pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL)
+  print('Evaluating detections')
+  imdb.evaluate_detections(all_boxes, output_dir)
diff --git a/src/tools/voc_eval_lib/nms/.gitignore b/src/tools/voc_eval_lib/nms/.gitignore
new file mode 100644
index 0000000..e69de29
diff --git a/src/tools/voc_eval_lib/nms/__init__.py b/src/tools/voc_eval_lib/nms/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/tools/voc_eval_lib/nms/cpu_nms.c b/src/tools/voc_eval_lib/nms/cpu_nms.c
new file mode 100644
index 0000000..96f7ec4
--- /dev/null
+++ b/src/tools/voc_eval_lib/nms/cpu_nms.c
@@ -0,0 +1,6869 @@
+/* Generated by Cython 0.20.1 on Wed Oct  5 13:15:30 2016 */
+
+#define PY_SSIZE_T_CLEAN
+#ifndef CYTHON_USE_PYLONG_INTERNALS
+#ifdef PYLONG_BITS_IN_DIGIT
+#define CYTHON_USE_PYLONG_INTERNALS 0
+#else
+#include "pyconfig.h"
+#ifdef PYLONG_BITS_IN_DIGIT
+#define CYTHON_USE_PYLONG_INTERNALS 1
+#else
+#define CYTHON_USE_PYLONG_INTERNALS 0
+#endif
+#endif
+#endif
+#include "Python.h"
+#ifndef Py_PYTHON_H
+    #error Python headers needed to compile C extensions, please install development version of Python.
+#elif PY_VERSION_HEX < 0x02040000
+    #error Cython requires Python 2.4+.
+#else
+#define CYTHON_ABI "0_20_1"
+#include <stddef.h> /* For offsetof */
+#ifndef offsetof
+#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
+#endif
+#if !defined(WIN32) && !defined(MS_WINDOWS)
+  #ifndef __stdcall
+    #define __stdcall
+  #endif
+  #ifndef __cdecl
+    #define __cdecl
+  #endif
+  #ifndef __fastcall
+    #define __fastcall
+  #endif
+#endif
+#ifndef DL_IMPORT
+  #define DL_IMPORT(t) t
+#endif
+#ifndef DL_EXPORT
+  #define DL_EXPORT(t) t
+#endif
+#ifndef PY_LONG_LONG
+  #define PY_LONG_LONG LONG_LONG
+#endif
+#ifndef Py_HUGE_VAL
+  #define Py_HUGE_VAL HUGE_VAL
+#endif
+#ifdef PYPY_VERSION
+#define CYTHON_COMPILING_IN_PYPY 1
+#define CYTHON_COMPILING_IN_CPYTHON 0
+#else
+#define CYTHON_COMPILING_IN_PYPY 0
+#define CYTHON_COMPILING_IN_CPYTHON 1
+#endif
+#if CYTHON_COMPILING_IN_PYPY
+#define Py_OptimizeFlag 0
+#endif
+#if PY_VERSION_HEX < 0x02050000
+  typedef int Py_ssize_t;
+  #define PY_SSIZE_T_MAX INT_MAX
+  #define PY_SSIZE_T_MIN INT_MIN
+  #define PY_FORMAT_SIZE_T ""
+  #define CYTHON_FORMAT_SSIZE_T ""
+  #define PyInt_FromSsize_t(z) PyInt_FromLong(z)
+  #define PyInt_AsSsize_t(o)   __Pyx_PyInt_As_int(o)
+  #define PyNumber_Index(o)    ((PyNumber_Check(o) && !PyFloat_Check(o)) ? PyNumber_Int(o) : \
+                                (PyErr_Format(PyExc_TypeError, \
+                                              "expected index value, got %.200s", Py_TYPE(o)->tp_name), \
+                                 (PyObject*)0))
+  #define __Pyx_PyIndex_Check(o) (PyNumber_Check(o) && !PyFloat_Check(o) && \
+                                  !PyComplex_Check(o))
+  #define PyIndex_Check __Pyx_PyIndex_Check
+  #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message)
+  #define __PYX_BUILD_PY_SSIZE_T "i"
+#else
+  #define __PYX_BUILD_PY_SSIZE_T "n"
+  #define CYTHON_FORMAT_SSIZE_T "z"
+  #define __Pyx_PyIndex_Check PyIndex_Check
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt)
+  #define Py_TYPE(ob)   (((PyObject*)(ob))->ob_type)
+  #define Py_SIZE(ob)   (((PyVarObject*)(ob))->ob_size)
+  #define PyVarObject_HEAD_INIT(type, size) \
+          PyObject_HEAD_INIT(type) size,
+  #define PyType_Modified(t)
+  typedef struct { /* Stand-in Py_buffer definition; only compiled when PY_VERSION_HEX < 0x02060000 (see the enclosing #if). */
+     void *buf;
+     PyObject *obj;
+     Py_ssize_t len;
+     Py_ssize_t itemsize;
+     int readonly;
+     int ndim;
+     char *format;
+     Py_ssize_t *shape;
+     Py_ssize_t *strides;
+     Py_ssize_t *suboffsets;
+     void *internal;
+  } Py_buffer; /* NOTE(review): field meanings presumably match CPython's buffer-protocol Py_buffer -- confirm before relying on them. */
+  #define PyBUF_SIMPLE 0
+  #define PyBUF_WRITABLE 0x0001
+  #define PyBUF_FORMAT 0x0004
+  #define PyBUF_ND 0x0008
+  #define PyBUF_STRIDES (0x0010 | PyBUF_ND)
+  #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES)
+  #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES)
+  #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES)
+  #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)
+  #define PyBUF_RECORDS (PyBUF_STRIDES | PyBUF_FORMAT | PyBUF_WRITABLE)
+  #define PyBUF_FULL (PyBUF_INDIRECT | PyBUF_FORMAT | PyBUF_WRITABLE)
+  typedef int (*getbufferproc)(PyObject *, Py_buffer *, int);
+  typedef void (*releasebufferproc)(PyObject *, Py_buffer *);
+#endif
+#if PY_MAJOR_VERSION < 3
+  #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
+  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \
+          PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+  #define __Pyx_DefaultClassType PyClass_Type
+#else
+  #define __Pyx_BUILTIN_MODULE_NAME "builtins"
+  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \
+          PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+  #define __Pyx_DefaultClassType PyType_Type
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define PyUnicode_FromString(s) PyUnicode_Decode(s, strlen(s), "UTF-8", "strict")
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define Py_TPFLAGS_CHECKTYPES 0
+  #define Py_TPFLAGS_HAVE_INDEX 0
+#endif
+#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3)
+  #define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define Py_TPFLAGS_HAVE_VERSION_TAG 0
+#endif
+#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TPFLAGS_IS_ABSTRACT)
+  #define Py_TPFLAGS_IS_ABSTRACT 0
+#endif
+#if PY_VERSION_HEX < 0x030400a1 && !defined(Py_TPFLAGS_HAVE_FINALIZE)
+  #define Py_TPFLAGS_HAVE_FINALIZE 0
+#endif
+#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)
+  #define CYTHON_PEP393_ENABLED 1
+  #define __Pyx_PyUnicode_READY(op)       (likely(PyUnicode_IS_READY(op)) ? \
+                                              0 : _PyUnicode_Ready((PyObject *)(op)))
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_LENGTH(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
+  #define __Pyx_PyUnicode_KIND(u)         PyUnicode_KIND(u)
+  #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
+  #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
+#else
+  #define CYTHON_PEP393_ENABLED 0
+  #define __Pyx_PyUnicode_READY(op)       (0)
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_SIZE(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
+  #define __Pyx_PyUnicode_KIND(u)         (sizeof(Py_UNICODE))
+  #define __Pyx_PyUnicode_DATA(u)         ((void*)PyUnicode_AS_UNICODE(u))
+  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
+#endif
+#if CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyUnicode_Concat(a, b)      PyNumber_Add(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  PyNumber_Add(a, b)
+#else
+  #define __Pyx_PyUnicode_Concat(a, b)      PyUnicode_Concat(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ? \
+      PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))
+#endif
+#define __Pyx_PyString_FormatSafe(a, b)  ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))
+#define __Pyx_PyUnicode_FormatSafe(a, b)  ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyString_Format(a, b)  PyUnicode_Format(a, b)
+#else
+  #define __Pyx_PyString_Format(a, b)  PyString_Format(a, b)
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBaseString_Type            PyUnicode_Type
+  #define PyStringObject               PyUnicodeObject
+  #define PyString_Type                PyUnicode_Type
+  #define PyString_Check               PyUnicode_Check
+  #define PyString_CheckExact          PyUnicode_CheckExact
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define PyBytesObject                PyStringObject
+  #define PyBytes_Type                 PyString_Type
+  #define PyBytes_Check                PyString_Check
+  #define PyBytes_CheckExact           PyString_CheckExact
+  #define PyBytes_FromString           PyString_FromString
+  #define PyBytes_FromStringAndSize    PyString_FromStringAndSize
+  #define PyBytes_FromFormat           PyString_FromFormat
+  #define PyBytes_DecodeEscape         PyString_DecodeEscape
+  #define PyBytes_AsString             PyString_AsString
+  #define PyBytes_AsStringAndSize      PyString_AsStringAndSize
+  #define PyBytes_Size                 PyString_Size
+  #define PyBytes_AS_STRING            PyString_AS_STRING
+  #define PyBytes_GET_SIZE             PyString_GET_SIZE
+  #define PyBytes_Repr                 PyString_Repr
+  #define PyBytes_Concat               PyString_Concat
+  #define PyBytes_ConcatAndDel         PyString_ConcatAndDel
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
+  #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
+#else
+  #define __Pyx_PyBaseString_Check(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj) || \
+                                         PyString_Check(obj) || PyUnicode_Check(obj))
+  #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define PySet_Check(obj)             PyObject_TypeCheck(obj, &PySet_Type)
+  #define PyFrozenSet_Check(obj)       PyObject_TypeCheck(obj, &PyFrozenSet_Type)
+#endif
+#ifndef PySet_CheckExact
+  #define PySet_CheckExact(obj)        (Py_TYPE(obj) == &PySet_Type)
+#endif
+#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
+#if PY_MAJOR_VERSION >= 3
+  #define PyIntObject                  PyLongObject
+  #define PyInt_Type                   PyLong_Type
+  #define PyInt_Check(op)              PyLong_Check(op)
+  #define PyInt_CheckExact(op)         PyLong_CheckExact(op)
+  #define PyInt_FromString             PyLong_FromString
+  #define PyInt_FromUnicode            PyLong_FromUnicode
+  #define PyInt_FromLong               PyLong_FromLong
+  #define PyInt_FromSize_t             PyLong_FromSize_t
+  #define PyInt_FromSsize_t            PyLong_FromSsize_t
+  #define PyInt_AsLong                 PyLong_AsLong
+  #define PyInt_AS_LONG                PyLong_AS_LONG
+  #define PyInt_AsSsize_t              PyLong_AsSsize_t
+  #define PyInt_AsUnsignedLongMask     PyLong_AsUnsignedLongMask
+  #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
+  #define PyNumber_Int                 PyNumber_Long
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBoolObject                 PyLongObject
+#endif
+#if PY_VERSION_HEX < 0x030200A4
+  typedef long Py_hash_t;
+  #define __Pyx_PyInt_FromHash_t PyInt_FromLong
+  #define __Pyx_PyInt_AsHash_t   PyInt_AsLong
+#else
+  #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
+  #define __Pyx_PyInt_AsHash_t   PyInt_AsSsize_t
+#endif
+#if (PY_MAJOR_VERSION < 3) || (PY_VERSION_HEX >= 0x03010300)
+  #define __Pyx_PySequence_GetSlice(obj, a, b) PySequence_GetSlice(obj, a, b)
+  #define __Pyx_PySequence_SetSlice(obj, a, b, value) PySequence_SetSlice(obj, a, b, value)
+  #define __Pyx_PySequence_DelSlice(obj, a, b) PySequence_DelSlice(obj, a, b)
+#else
+  #define __Pyx_PySequence_GetSlice(obj, a, b) (unlikely(!(obj)) ? \
+        (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), (PyObject*)0) : \
+        (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_GetSlice(obj, a, b)) : \
+            (PyErr_Format(PyExc_TypeError, "'%.200s' object is unsliceable", (obj)->ob_type->tp_name), (PyObject*)0)))
+  #define __Pyx_PySequence_SetSlice(obj, a, b, value) (unlikely(!(obj)) ? \
+        (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \
+        (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_SetSlice(obj, a, b, value)) : \
+            (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice assignment", (obj)->ob_type->tp_name), -1)))
+  #define __Pyx_PySequence_DelSlice(obj, a, b) (unlikely(!(obj)) ? \
+        (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \
+        (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_DelSlice(obj, a, b)) : \
+            (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice deletion", (obj)->ob_type->tp_name), -1)))
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : PyInstanceMethod_New(func))
+#endif
+#if PY_VERSION_HEX < 0x02050000
+  #define __Pyx_GetAttrString(o,n)   PyObject_GetAttrString((o),((char *)(n)))
+  #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),((char *)(n)),(a))
+  #define __Pyx_DelAttrString(o,n)   PyObject_DelAttrString((o),((char *)(n)))
+#else
+  #define __Pyx_GetAttrString(o,n)   PyObject_GetAttrString((o),(n))
+  #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),(n),(a))
+  #define __Pyx_DelAttrString(o,n)   PyObject_DelAttrString((o),(n))
+#endif
+#if PY_VERSION_HEX < 0x02050000
+  #define __Pyx_NAMESTR(n) ((char *)(n))
+  #define __Pyx_DOCSTR(n)  ((char *)(n))
+#else
+  #define __Pyx_NAMESTR(n) (n)
+  #define __Pyx_DOCSTR(n)  (n)
+#endif
+#ifndef CYTHON_INLINE
+  #if defined(__GNUC__)
+    #define CYTHON_INLINE __inline__
+  #elif defined(_MSC_VER)
+    #define CYTHON_INLINE __inline
+  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define CYTHON_INLINE inline
+  #else
+    #define CYTHON_INLINE
+  #endif
+#endif
+#ifndef CYTHON_RESTRICT
+  #if defined(__GNUC__)
+    #define CYTHON_RESTRICT __restrict__
+  #elif defined(_MSC_VER) && _MSC_VER >= 1400
+    #define CYTHON_RESTRICT __restrict
+  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define CYTHON_RESTRICT restrict
+  #else
+    #define CYTHON_RESTRICT
+  #endif
+#endif
+#ifdef NAN
+#define __PYX_NAN() ((float) NAN)
+#else
+static CYTHON_INLINE float __PYX_NAN() {
+  /* Fallback for C libraries that do not define NAN: fill every byte of a
+   float with 0xFF. All exponent bits are then 1 and the mantissa is nonzero
+   with its top bit set, which is a quiet NaN; the sign bit is irrelevant. */
+  float quiet_nan;
+  memset(&quiet_nan, 0xFF, sizeof(float));
+  return quiet_nan;
+}
+#endif
+
+
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_TrueDivide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceTrueDivide(x,y)
+#else
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_Divide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceDivide(x,y)
+#endif
+
+#ifndef __PYX_EXTERN_C
+  #ifdef __cplusplus
+    #define __PYX_EXTERN_C extern "C"
+  #else
+    #define __PYX_EXTERN_C extern
+  #endif
+#endif
+
+#if defined(WIN32) || defined(MS_WINDOWS)
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+#define __PYX_HAVE__nms__cpu_nms
+#define __PYX_HAVE_API__nms__cpu_nms
+#include "string.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif /* _OPENMP */
+
+#ifdef PYREX_WITHOUT_ASSERTIONS
+#define CYTHON_WITHOUT_ASSERTIONS
+#endif
+
+#ifndef CYTHON_UNUSED
+# if defined(__GNUC__)
+#   if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#     define CYTHON_UNUSED __attribute__ ((__unused__))
+#   else
+#     define CYTHON_UNUSED
+#   endif
+# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
+#   define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+#   define CYTHON_UNUSED
+# endif
+#endif
+typedef struct {PyObject **p; char *s; const Py_ssize_t n; const char* encoding;
+                const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /* proto: record type for string-table rows. NOTE(review): field roles are not visible here -- presumably *p receives the object built from the n-byte C string s, with encoding/is_unicode/is_str/intern selecting how it is built; confirm against the code that consumes the table. */
+
+#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0
+#define __PYX_DEFAULT_STRING_ENCODING ""
+#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString
+#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#define __Pyx_fits_Py_ssize_t(v, type, is_signed)  (    \
+    (sizeof(type) < sizeof(Py_ssize_t))  ||             \
+    (sizeof(type) > sizeof(Py_ssize_t) &&               \
+          likely(v < (type)PY_SSIZE_T_MAX ||            \
+                 v == (type)PY_SSIZE_T_MAX)  &&         \
+          (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||       \
+                                v == (type)PY_SSIZE_T_MIN)))  ||  \
+    (sizeof(type) == sizeof(Py_ssize_t) &&              \
+          (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||        \
+                               v == (type)PY_SSIZE_T_MAX)))  )
+static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject*);
+static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
+#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s))
+#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
+#define __Pyx_PyBytes_FromString        PyBytes_FromString
+#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*);
+#if PY_MAJOR_VERSION < 3
+    #define __Pyx_PyStr_FromString        __Pyx_PyBytes_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#else
+    #define __Pyx_PyStr_FromString        __Pyx_PyUnicode_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
+#endif
+#define __Pyx_PyObject_AsSString(s)    ((signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsUString(s)    ((unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_FromUString(s)  __Pyx_PyObject_FromString((char*)s)
+#define __Pyx_PyBytes_FromUString(s)   __Pyx_PyBytes_FromString((char*)s)
+#define __Pyx_PyByteArray_FromUString(s)   __Pyx_PyByteArray_FromString((char*)s)
+#define __Pyx_PyStr_FromUString(s)     __Pyx_PyStr_FromString((char*)s)
+#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s)
+#if PY_MAJOR_VERSION < 3
+static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u)
+{   /* Length of a zero-terminated Py_UNICODE string, terminator excluded. */
+    size_t length = 0;
+    while (u[length]) ++length;
+    return length;
+}
+#else
+#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen
+#endif
+#define __Pyx_PyUnicode_FromUnicode(u)       PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
+#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
+#define __Pyx_PyUnicode_AsUnicode            PyUnicode_AsUnicode
+#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None)
+#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x);
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
+#else
+#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
+#endif
+#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+static int __Pyx_sys_getdefaultencoding_not_ascii;
+static int __Pyx_init_sys_getdefaultencoding_params(void) { /* Records whether sys.getdefaultencoding() is "ascii"; returns 0 on success, -1 with a Python exception set on failure (ValueError if the encoding does not reproduce the 128 ASCII bytes). */
+    PyObject* sys = NULL;
+    PyObject* default_encoding = NULL;
+    PyObject* ascii_chars_u = NULL;
+    PyObject* ascii_chars_b = NULL;
+    sys = PyImport_ImportModule("sys");
+    if (sys == NULL) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    if (default_encoding == NULL) goto bad;
+    if (strcmp(PyBytes_AsString(default_encoding), "ascii") == 0) {
+        __Pyx_sys_getdefaultencoding_not_ascii = 0;
+    } else {
+        const char* default_encoding_c = PyBytes_AS_STRING(default_encoding);
+        char ascii_chars[128]; /* bytes 0..127; deliberately not NUL-terminated, always used with an explicit length */
+        int c;
+        for (c = 0; c < 128; c++) {
+            ascii_chars[c] = (char) c;
+        }
+        __Pyx_sys_getdefaultencoding_not_ascii = 1;
+        ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
+        if (ascii_chars_u == NULL) goto bad;
+        ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL);
+        if (ascii_chars_b == NULL || !PyBytes_Check(ascii_chars_b) || PyBytes_GET_SIZE(ascii_chars_b) < 128 || /* guards keep memcmp inside the encoded buffer */
+                memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) { /* memcmp, not strncmp: ascii_chars[0] is NUL, so a string compare would stop after the first byte */
+            PyErr_Format(PyExc_ValueError,
+                "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
+                default_encoding_c);
+            goto bad;
+        }
+    }
+    Py_XDECREF(sys); /* success path: drop every temporary reference */
+    Py_XDECREF(default_encoding);
+    Py_XDECREF(ascii_chars_u);
+    Py_XDECREF(ascii_chars_b);
+    return 0;
+bad: /* failure path: same cleanup, different return code */
+    Py_XDECREF(sys);
+    Py_XDECREF(default_encoding);
+    Py_XDECREF(ascii_chars_u);
+    Py_XDECREF(ascii_chars_b);
+    return -1;
+}
+#endif
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
+#else
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+static char* __PYX_DEFAULT_STRING_ENCODING;
+static int __Pyx_init_sys_getdefaultencoding_params(void) { /* Copies sys.getdefaultencoding() into the heap-allocated __PYX_DEFAULT_STRING_ENCODING; returns 0 on success, -1 with a Python exception set on failure. */
+    PyObject* sys = NULL;
+    PyObject* default_encoding = NULL;
+    char* default_encoding_c;
+    int status = -1; /* stays -1 unless every step below succeeds */
+    sys = PyImport_ImportModule("sys");
+    if (sys == NULL) goto done;
+    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    if (default_encoding == NULL) goto done;
+    default_encoding_c = PyBytes_AS_STRING(default_encoding);
+    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1); /* +1 for the terminating NUL that strcpy writes */
+    if (__PYX_DEFAULT_STRING_ENCODING == NULL) { PyErr_NoMemory(); goto done; }
+    strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
+    status = 0;
+done: /* shared cleanup for success and failure; the copied string is intentionally kept */
+    Py_XDECREF(sys);
+    Py_XDECREF(default_encoding);
+    return status;
+}
+#endif
+#endif
+
+
+#ifdef __GNUC__
+  /* Test for GCC > 2.95 */
+  #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))
+    #define likely(x)   __builtin_expect(!!(x), 1)
+    #define unlikely(x) __builtin_expect(!!(x), 0)
+  #else /* __GNUC__ > 2 ... */
+    #define likely(x)   (x)
+    #define unlikely(x) (x)
+  #endif /* __GNUC__ > 2 ... */
+#else /* __GNUC__ */
+  #define likely(x)   (x)
+  #define unlikely(x) (x)
+#endif /* __GNUC__ */
+
+static PyObject *__pyx_m;
+static PyObject *__pyx_d;
+static PyObject *__pyx_b;
+static PyObject *__pyx_empty_tuple;
+static PyObject *__pyx_empty_bytes;
+static int __pyx_lineno;
+static int __pyx_clineno = 0;
+static const char * __pyx_cfilenm= __FILE__;
+static const char *__pyx_filename;
+
+#if !defined(CYTHON_CCOMPLEX)
+  #if defined(__cplusplus)
+    #define CYTHON_CCOMPLEX 1
+  #elif defined(_Complex_I)
+    #define CYTHON_CCOMPLEX 1
+  #else
+    #define CYTHON_CCOMPLEX 0
+  #endif
+#endif
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    #include <complex>
+  #else
+    #include <complex.h>
+  #endif
+#endif
+#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__)
+  #undef _Complex_I
+  #define _Complex_I 1.0fj
+#endif
+
+
+static const char *__pyx_f[] = { /* Names of the Cython source files this module was generated from. NOTE(review): presumably indexed when setting __pyx_filename for error reporting -- confirm. */
+  "cpu_nms.pyx",
+  "__init__.pxd",
+  "type.pxd",
+};
+#define IS_UNSIGNED(type) (((type) -1) > 0)
+struct __Pyx_StructField_;
+#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0)
+typedef struct { /* Static description of one C type: its name, size, array extents and type group. */
+  const char* name; /* for error messages only */
+  struct __Pyx_StructField_* fields; /* NOTE(review): presumably the member list when typegroup is 'S' -- confirm */
+  size_t size;     /* sizeof(type) */
+  size_t arraysize[8]; /* length of array in each dimension */
+  int ndim; /* NOTE(review): presumably how many arraysize entries are in use -- confirm */
+  char typegroup; /* _R_eal, _C_omplex, Signed _I_nt, _U_nsigned int, _S_truct, _P_ointer, _O_bject, c_H_ar */
+  char is_unsigned;
+  int flags; /* NOTE(review): presumably a bit set of __PYX_BUF_FLAGS_* values -- confirm */
+} __Pyx_TypeInfo;
+typedef struct __Pyx_StructField_ { /* One named member of a described struct; __Pyx_TypeInfo.fields points at records of this type. */
+  __Pyx_TypeInfo* type; /* type description of this member */
+  const char* name; /* member name */
+  size_t offset; /* NOTE(review): presumably the byte offset inside the parent struct -- confirm */
+} __Pyx_StructField;
+typedef struct { /* Pairs a struct field with an offset; __Pyx_BufFmt_Context.head points at elements of this type. */
+  __Pyx_StructField* field;
+  size_t parent_offset; /* NOTE(review): presumably the offset of the enclosing struct -- confirm */
+} __Pyx_BufFmt_StackElem;
+typedef struct { /* NOTE(review): the name suggests working state for checking buffer format strings; the consumers are outside this view, so the field notes below are assumptions to confirm. */
+  __Pyx_StructField root;
+  __Pyx_BufFmt_StackElem* head; /* presumably the current top of a stack of nested fields -- confirm */
+  size_t fmt_offset;
+  size_t new_count, enc_count;
+  size_t struct_alignment;
+  int is_complex;
+  char enc_type;
+  char new_packmode;
+  char enc_packmode;
+  char is_valid_array;
+} __Pyx_BufFmt_Context;
+
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":723
+ * # in Cython to enable them only on the right systems.
+ * 
+ * ctypedef npy_int8       int8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ */
+typedef npy_int8 __pyx_t_5numpy_int8_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":724
+ * 
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t
+ */
+typedef npy_int16 __pyx_t_5numpy_int16_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":725
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int64      int64_t
+ * #ctypedef npy_int96      int96_t
+ */
+typedef npy_int32 __pyx_t_5numpy_int32_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":726
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_int96      int96_t
+ * #ctypedef npy_int128     int128_t
+ */
+typedef npy_int64 __pyx_t_5numpy_int64_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":730
+ * #ctypedef npy_int128     int128_t
+ * 
+ * ctypedef npy_uint8      uint8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ */
+typedef npy_uint8 __pyx_t_5numpy_uint8_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":731
+ * 
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t
+ */
+typedef npy_uint16 __pyx_t_5numpy_uint16_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":732
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint64     uint64_t
+ * #ctypedef npy_uint96     uint96_t
+ */
+typedef npy_uint32 __pyx_t_5numpy_uint32_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":733
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_uint96     uint96_t
+ * #ctypedef npy_uint128    uint128_t
+ */
+typedef npy_uint64 __pyx_t_5numpy_uint64_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":737
+ * #ctypedef npy_uint128    uint128_t
+ * 
+ * ctypedef npy_float32    float32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_float64    float64_t
+ * #ctypedef npy_float80    float80_t
+ */
+typedef npy_float32 __pyx_t_5numpy_float32_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":738
+ * 
+ * ctypedef npy_float32    float32_t
+ * ctypedef npy_float64    float64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_float80    float80_t
+ * #ctypedef npy_float128   float128_t
+ */
+typedef npy_float64 __pyx_t_5numpy_float64_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":747
+ * # The int types are mapped a bit surprising --
+ * # numpy.int corresponds to 'l' and numpy.long to 'q'
+ * ctypedef npy_long       int_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longlong   long_t
+ * ctypedef npy_longlong   longlong_t
+ */
+typedef npy_long __pyx_t_5numpy_int_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":748
+ * # numpy.int corresponds to 'l' and numpy.long to 'q'
+ * ctypedef npy_long       int_t
+ * ctypedef npy_longlong   long_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longlong   longlong_t
+ * 
+ */
+typedef npy_longlong __pyx_t_5numpy_long_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":749
+ * ctypedef npy_long       int_t
+ * ctypedef npy_longlong   long_t
+ * ctypedef npy_longlong   longlong_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_ulong      uint_t
+ */
+typedef npy_longlong __pyx_t_5numpy_longlong_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":751
+ * ctypedef npy_longlong   longlong_t
+ * 
+ * ctypedef npy_ulong      uint_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong  ulong_t
+ * ctypedef npy_ulonglong  ulonglong_t
+ */
+typedef npy_ulong __pyx_t_5numpy_uint_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":752
+ * 
+ * ctypedef npy_ulong      uint_t
+ * ctypedef npy_ulonglong  ulong_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulong_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":753
+ * ctypedef npy_ulong      uint_t
+ * ctypedef npy_ulonglong  ulong_t
+ * ctypedef npy_ulonglong  ulonglong_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_intp       intp_t
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":755
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ * ctypedef npy_intp       intp_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uintp      uintp_t
+ * 
+ */
+typedef npy_intp __pyx_t_5numpy_intp_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":756
+ * 
+ * ctypedef npy_intp       intp_t
+ * ctypedef npy_uintp      uintp_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_double     float_t
+ */
+typedef npy_uintp __pyx_t_5numpy_uintp_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":758
+ * ctypedef npy_uintp      uintp_t
+ * 
+ * ctypedef npy_double     float_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t
+ */
+typedef npy_double __pyx_t_5numpy_float_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":759
+ * 
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longdouble longdouble_t
+ * 
+ */
+typedef npy_double __pyx_t_5numpy_double_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":760
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_cfloat      cfloat_t
+ */
+typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    typedef ::std::complex< float > __pyx_t_float_complex;
+  #else
+    typedef float _Complex __pyx_t_float_complex;
+  #endif
+#else
+    typedef struct { float real, imag; } __pyx_t_float_complex;
+#endif
+
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    typedef ::std::complex< double > __pyx_t_double_complex;
+  #else
+    typedef double _Complex __pyx_t_double_complex;
+  #endif
+#else
+    typedef struct { double real, imag; } __pyx_t_double_complex;
+#endif
+
+
+/*--- Type declarations ---*/
+/* numpy.pxd complex aliases: cfloat_t, cdouble_t, clongdouble_t, and complex_t as a second name for npy_cdouble. */
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":762
+ * ctypedef npy_longdouble longdouble_t
+ * 
+ * ctypedef npy_cfloat      cfloat_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_cdouble     cdouble_t
+ * ctypedef npy_clongdouble clongdouble_t
+ */
+typedef npy_cfloat __pyx_t_5numpy_cfloat_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":763
+ * 
+ * ctypedef npy_cfloat      cfloat_t
+ * ctypedef npy_cdouble     cdouble_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_clongdouble clongdouble_t
+ * 
+ */
+typedef npy_cdouble __pyx_t_5numpy_cdouble_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":764
+ * ctypedef npy_cfloat      cfloat_t
+ * ctypedef npy_cdouble     cdouble_t
+ * ctypedef npy_clongdouble clongdouble_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_cdouble     complex_t
+ */
+typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":766
+ * ctypedef npy_clongdouble clongdouble_t
+ * 
+ * ctypedef npy_cdouble     complex_t             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):
+ */
+typedef npy_cdouble __pyx_t_5numpy_complex_t;
+#ifndef CYTHON_REFNANNY  /* reference-tracking hooks default to off unless the build defines CYTHON_REFNANNY */
+  #define CYTHON_REFNANNY 0
+#endif
+#if CYTHON_REFNANNY  /* enabled: route INCREF/DECREF/GOTREF/GIVEREF through the __Pyx_RefNanny callback table */
+  typedef struct {
+    void (*INCREF)(void*, PyObject*, int);
+    void (*DECREF)(void*, PyObject*, int);
+    void (*GOTREF)(void*, PyObject*, int);
+    void (*GIVEREF)(void*, PyObject*, int);
+    void* (*SetupContext)(const char*, int, const char*);
+    void (*FinishContext)(void**);
+  } __Pyx_RefNannyAPIStruct;  /* the macros below pass each callback the context pointer, the object and __LINE__ */
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); /*proto*/
+  #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;
+#ifdef WITH_THREAD  /* threaded builds: optionally hold the GIL while SetupContext runs */
+  #define __Pyx_RefNannySetupContext(name, acquire_gil) \
+          if (acquire_gil) { \
+              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure(); \
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \
+              PyGILState_Release(__pyx_gilstate_save); \
+          } else { \
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \
+          }
+#else  /* no thread support: acquire_gil is ignored */
+  #define __Pyx_RefNannySetupContext(name, acquire_gil) \
+          __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__)
+#endif
+  #define __Pyx_RefNannyFinishContext() \
+          __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
+  #define __Pyx_INCREF(r)  __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_DECREF(r)  __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_GOTREF(r)  __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_XINCREF(r)  do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0)
+  #define __Pyx_XDECREF(r)  do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0)
+  #define __Pyx_XGOTREF(r)  do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0)
+  #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0)
+#else  /* disabled: plain Py_(X)INCREF / Py_(X)DECREF; context and GOTREF/GIVEREF macros expand to nothing */
+  #define __Pyx_RefNannyDeclarations
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)
+  #define __Pyx_RefNannyFinishContext()
+  #define __Pyx_INCREF(r) Py_INCREF(r)
+  #define __Pyx_DECREF(r) Py_DECREF(r)
+  #define __Pyx_GOTREF(r)
+  #define __Pyx_GIVEREF(r)
+  #define __Pyx_XINCREF(r) Py_XINCREF(r)
+  #define __Pyx_XDECREF(r) Py_XDECREF(r)
+  #define __Pyx_XGOTREF(r)
+  #define __Pyx_XGIVEREF(r)
+#endif /* CYTHON_REFNANNY */
+#define __Pyx_XDECREF_SET(r, v) do {                            \
+        PyObject *tmp = (PyObject *) r;                         \
+        r = v; __Pyx_XDECREF(tmp);                              \
+    } while (0)  /* store v in r first, then release the previous value via the NULL-checking XDECREF */
+#define __Pyx_DECREF_SET(r, v) do {                             \
+        PyObject *tmp = (PyObject *) r;                         \
+        r = v; __Pyx_DECREF(tmp);                               \
+    } while (0)  /* same ordering, but releases with __Pyx_DECREF (no NULL check on the previous value) */
+#define __Pyx_CLEAR(r)    do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0)  /* set r to NULL before releasing the old reference */
+#define __Pyx_XCLEAR(r)   do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0)  /* as __Pyx_CLEAR, skipped when r is already NULL */
+/* Attribute lookup by name object: call the type's tp_getattro slot directly when it is set, else fall back. */
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) {
+    PyTypeObject* obj_type = Py_TYPE(obj);
+    if (likely(obj_type->tp_getattro)) {
+        return obj_type->tp_getattro(obj, attr_name);
+    }
+#if PY_MAJOR_VERSION < 3
+    if (likely(obj_type->tp_getattr)) return obj_type->tp_getattr(obj, PyString_AS_STRING(attr_name));
+#endif
+    return PyObject_GetAttr(obj, attr_name);  /* no usable slot on the type: defer to the generic API */
+}
+#else
+#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
+#endif
+
+static PyObject *__Pyx_GetBuiltinName(PyObject *name); /*proto*/
+
+static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
+    Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); /*proto*/
+
+static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); /*proto*/
+
+static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[], \
+    PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args, \
+    const char* function_name); /*proto*/
+
+static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed,
+    const char *name, int exact); /*proto*/
+
+static CYTHON_INLINE int  __Pyx_GetBufferAndValidate(Py_buffer* buf, PyObject* obj,
+    __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack);
+static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info);
+
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type); /*proto*/
+
+#if CYTHON_COMPILING_IN_CPYTHON
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); /*proto*/
+#else
+#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
+#endif
+
+static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); /*proto*/
+
+static void __Pyx_RaiseBufferIndexError(int axis); /*proto*/
+
+#define __Pyx_BufPtrStrided1d(type, buf, i0, s0) (type)((char*)buf + i0 * s0)
+#if CYTHON_COMPILING_IN_CPYTHON  /* the fast path touches PyListObject internals, so it is CPython-only */
+static CYTHON_INLINE int __Pyx_PyList_Append(PyObject* list, PyObject* x) {  /* append x to list; 0 on the fast path, else PyList_Append's result */
+    PyListObject* L = (PyListObject*) list;
+    Py_ssize_t len = Py_SIZE(list);
+    if (likely(L->allocated > len) & likely(len > (L->allocated >> 1))) {  /* a spare slot exists and the list is more than half full */
+        Py_INCREF(x);  /* PyList_SET_ITEM stores the pointer without adding a reference of its own */
+        PyList_SET_ITEM(list, len, x);
+        ((PyVarObject*) list)->ob_size = len + 1;  /* write the size field directly: Py_SIZE() is not assignable on newer CPython headers */
+        return 0;
+    }
+    return PyList_Append(list, x);  /* otherwise let CPython handle the append */
+}
+#else
+#define __Pyx_PyList_Append(L,x) PyList_Append(L,x)
+#endif
+
+#ifndef __PYX_FORCE_INIT_THREADS
+  #define __PYX_FORCE_INIT_THREADS 0
+#endif
+
+static CYTHON_INLINE void __Pyx_ErrRestore(PyObject *type, PyObject *value, PyObject *tb); /*proto*/
+static CYTHON_INLINE void __Pyx_ErrFetch(PyObject **type, PyObject **value, PyObject **tb); /*proto*/
+
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); /*proto*/
+
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected);
+
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index);
+
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void);
+/* Helper records backing the typed buffer locals (the __pyx_pybuffer_* / __pyx_pybuffernd_* pairs in function bodies). */
+typedef struct {
+  Py_ssize_t shape, strides, suboffsets;
+} __Pyx_Buf_DimInfo;  /* cached layout values for one dimension */
+typedef struct {
+  size_t refcount;
+  Py_buffer pybuffer;
+} __Pyx_Buffer;  /* a Py_buffer view paired with a counter */
+typedef struct {
+  __Pyx_Buffer *rcbuffer;
+  char *data;
+  __Pyx_Buf_DimInfo diminfo[8];  /* fixed room for 8 dimensions */
+} __Pyx_LocalBuf_ND;
+/* Buffer acquire/release: file-local helper functions on Python 2, direct aliases of the C-API calls on Python 3. */
+#if PY_MAJOR_VERSION < 3
+    static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags);
+    static void __Pyx_ReleaseBuffer(Py_buffer *view);
+#else
+    #define __Pyx_GetBuffer PyObject_GetBuffer
+    #define __Pyx_ReleaseBuffer PyBuffer_Release
+#endif
+
+
+static Py_ssize_t __Pyx_zeros[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static Py_ssize_t __Pyx_minusones[] = {-1, -1, -1, -1, -1, -1, -1, -1};
+
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); /*proto*/
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value);
+
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
+/* Read and write access to the real/imaginary parts, matching whichever complex representation is in use. */
+#if CYTHON_CCOMPLEX
+  #ifdef __cplusplus
+    #define __Pyx_CREAL(z) ((z).real())
+    #define __Pyx_CIMAG(z) ((z).imag())
+  #else
+    #define __Pyx_CREAL(z) (__real__(z))
+    #define __Pyx_CIMAG(z) (__imag__(z))
+  #endif
+#else  /* struct fallback: plain member access */
+    #define __Pyx_CREAL(z) ((z).real)
+    #define __Pyx_CIMAG(z) ((z).imag)
+#endif
+#if (defined(_WIN32) || defined(__clang__)) && defined(__cplusplus) && CYTHON_CCOMPLEX  /* these builds set the parts through the setter overloads */
+    #define __Pyx_SET_CREAL(z,x) ((z).real(x))
+    #define __Pyx_SET_CIMAG(z,y) ((z).imag(y))
+#else  /* everywhere else the accessor expression is assigned to directly */
+    #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x)
+    #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y)
+#endif
+
+static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float);
+/* Single-precision complex arithmetic: operator macros with a native complex type, helper-function prototypes otherwise. */
+#if CYTHON_CCOMPLEX
+    #define __Pyx_c_eqf(a, b)   ((a)==(b))
+    #define __Pyx_c_sumf(a, b)  ((a)+(b))
+    #define __Pyx_c_difff(a, b) ((a)-(b))
+    #define __Pyx_c_prodf(a, b) ((a)*(b))
+    #define __Pyx_c_quotf(a, b) ((a)/(b))
+    #define __Pyx_c_negf(a)     (-(a))
+  #ifdef __cplusplus
+    #define __Pyx_c_is_zerof(z) ((z)==(float)0)
+    #define __Pyx_c_conjf(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_absf(z)     (::std::abs(z))
+        #define __Pyx_c_powf(a, b)  (::std::pow(a, b))
+    #endif
+  #else  /* C: use the float-suffixed complex library functions */
+    #define __Pyx_c_is_zerof(z) ((z)==0)
+    #define __Pyx_c_conjf(z)    (conjf(z))
+    #if 1
+        #define __Pyx_c_absf(z)     (cabsf(z))
+        #define __Pyx_c_powf(a, b)  (cpowf(a, b))
+    #endif
+ #endif
+#else  /* struct representation: the operations are functions; only their prototypes appear here */
+    static CYTHON_INLINE int __Pyx_c_eqf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sumf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_difff(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prodf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quotf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_negf(__pyx_t_float_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zerof(__pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conjf(__pyx_t_float_complex);
+    #if 1
+        static CYTHON_INLINE float __Pyx_c_absf(__pyx_t_float_complex);
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_powf(__pyx_t_float_complex, __pyx_t_float_complex);
+    #endif
+#endif
+
+static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double);
+/* Double-precision complex arithmetic: operator macros with a native complex type, helper-function prototypes otherwise. */
+#if CYTHON_CCOMPLEX
+    #define __Pyx_c_eq(a, b)   ((a)==(b))
+    #define __Pyx_c_sum(a, b)  ((a)+(b))
+    #define __Pyx_c_diff(a, b) ((a)-(b))
+    #define __Pyx_c_prod(a, b) ((a)*(b))
+    #define __Pyx_c_quot(a, b) ((a)/(b))
+    #define __Pyx_c_neg(a)     (-(a))
+  #ifdef __cplusplus
+    #define __Pyx_c_is_zero(z) ((z)==(double)0)
+    #define __Pyx_c_conj(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs(z)     (::std::abs(z))
+        #define __Pyx_c_pow(a, b)  (::std::pow(a, b))
+    #endif
+  #else  /* C: use the double-precision complex library functions */
+    #define __Pyx_c_is_zero(z) ((z)==0)
+    #define __Pyx_c_conj(z)    (conj(z))
+    #if 1
+        #define __Pyx_c_abs(z)     (cabs(z))
+        #define __Pyx_c_pow(a, b)  (cpow(a, b))
+    #endif
+ #endif
+#else  /* struct representation: the operations are functions; only their prototypes appear here */
+    static CYTHON_INLINE int __Pyx_c_eq(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg(__pyx_t_double_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero(__pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj(__pyx_t_double_complex);
+    #if 1
+        static CYTHON_INLINE double __Pyx_c_abs(__pyx_t_double_complex);
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow(__pyx_t_double_complex, __pyx_t_double_complex);
+    #endif
+#endif
+
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
+
+static int __Pyx_check_binary_version(void);
+
+#if !defined(__Pyx_PyIdentifier_FromString)
+#if PY_MAJOR_VERSION < 3
+  #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s)
+#else
+  #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s)
+#endif
+#endif
+
+static PyObject *__Pyx_ImportModule(const char *name); /*proto*/
+
+static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, size_t size, int strict);  /*proto*/
+/* Cache associating a code_line with a PyCodeObject, plus the bisect/find/insert helpers that operate on it. */
+typedef struct {
+    int code_line;
+    PyCodeObject* code_object;
+} __Pyx_CodeObjectCacheEntry;
+struct __Pyx_CodeObjectCache {
+    int count;
+    int max_count;
+    __Pyx_CodeObjectCacheEntry* entries;
+};
+static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL};  /* starts empty: zero counts and no entries array */
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);
+static PyCodeObject *__pyx_find_code_object(int code_line);
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
+
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename); /*proto*/
+
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/
+
+
+/* Module declarations from 'cpython.buffer' */
+
+/* Module declarations from 'cpython.ref' */
+
+/* Module declarations from 'libc.string' */
+
+/* Module declarations from 'libc.stdio' */
+
+/* Module declarations from 'cpython.object' */
+
+/* Module declarations from '__builtin__' */
+
+/* Module declarations from 'cpython.type' */
+static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0;
+
+/* Module declarations from 'libc.stdlib' */
+
+/* Module declarations from 'numpy' */
+
+/* Module declarations from 'numpy' */
+static PyTypeObject *__pyx_ptype_5numpy_dtype = 0;
+static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0;
+static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0;
+static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *, char *, char *, int *); /*proto*/
+
+/* Module declarations from 'nms.cpu_nms' */
+static CYTHON_INLINE __pyx_t_5numpy_float32_t __pyx_f_3nms_7cpu_nms_max(__pyx_t_5numpy_float32_t, __pyx_t_5numpy_float32_t); /*proto*/
+static CYTHON_INLINE __pyx_t_5numpy_float32_t __pyx_f_3nms_7cpu_nms_min(__pyx_t_5numpy_float32_t, __pyx_t_5numpy_float32_t); /*proto*/
+static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t = { "float32_t", NULL, sizeof(__pyx_t_5numpy_float32_t), { 0 }, 0, 'R', 0, 0 };
+static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_int_t = { "int_t", NULL, sizeof(__pyx_t_5numpy_int_t), { 0 }, 0, IS_UNSIGNED(__pyx_t_5numpy_int_t) ? 'U' : 'I', IS_UNSIGNED(__pyx_t_5numpy_int_t), 0 };
+#define __Pyx_MODULE_NAME "nms.cpu_nms"
+int __pyx_module_is_main_nms__cpu_nms = 0;
+
+/* Implementation of 'nms.cpu_nms' */
+static PyObject *__pyx_builtin_range;
+static PyObject *__pyx_builtin_ValueError;
+static PyObject *__pyx_builtin_RuntimeError;
+static PyObject *__pyx_pf_3nms_7cpu_nms_cpu_nms(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dets, PyObject *__pyx_v_thresh); /* proto */
+static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */
+static char __pyx_k_B[] = "B";
+static char __pyx_k_H[] = "H";
+static char __pyx_k_I[] = "I";
+static char __pyx_k_L[] = "L";
+static char __pyx_k_O[] = "O";
+static char __pyx_k_Q[] = "Q";
+static char __pyx_k_b[] = "b";
+static char __pyx_k_d[] = "d";
+static char __pyx_k_f[] = "f";
+static char __pyx_k_g[] = "g";
+static char __pyx_k_h[] = "h";
+static char __pyx_k_i[] = "i";
+static char __pyx_k_j[] = "_j";
+static char __pyx_k_l[] = "l";
+static char __pyx_k_q[] = "q";
+static char __pyx_k_w[] = "w";
+static char __pyx_k_Zd[] = "Zd";
+static char __pyx_k_Zf[] = "Zf";
+static char __pyx_k_Zg[] = "Zg";
+static char __pyx_k_np[] = "np";
+static char __pyx_k_x1[] = "x1";
+static char __pyx_k_x2[] = "x2";
+static char __pyx_k_y1[] = "y1";
+static char __pyx_k_y2[] = "y2";
+static char __pyx_k_i_2[] = "_i";
+static char __pyx_k_int[] = "int";
+static char __pyx_k_ix1[] = "ix1";
+static char __pyx_k_ix2[] = "ix2";
+static char __pyx_k_iy1[] = "iy1";
+static char __pyx_k_iy2[] = "iy2";
+static char __pyx_k_j_2[] = "j";
+static char __pyx_k_ovr[] = "ovr";
+static char __pyx_k_xx1[] = "xx1";
+static char __pyx_k_xx2[] = "xx2";
+static char __pyx_k_yy1[] = "yy1";
+static char __pyx_k_yy2[] = "yy2";
+static char __pyx_k_dets[] = "dets";
+static char __pyx_k_keep[] = "keep";
+static char __pyx_k_main[] = "__main__";
+static char __pyx_k_test[] = "__test__";
+static char __pyx_k_areas[] = "areas";
+static char __pyx_k_dtype[] = "dtype";
+static char __pyx_k_iarea[] = "iarea";
+static char __pyx_k_inter[] = "inter";
+static char __pyx_k_ndets[] = "ndets";
+static char __pyx_k_numpy[] = "numpy";
+static char __pyx_k_order[] = "order";
+static char __pyx_k_range[] = "range";
+static char __pyx_k_zeros[] = "zeros";
+static char __pyx_k_import[] = "__import__";
+static char __pyx_k_scores[] = "scores";
+static char __pyx_k_thresh[] = "thresh";
+static char __pyx_k_argsort[] = "argsort";
+static char __pyx_k_cpu_nms[] = "cpu_nms";
+static char __pyx_k_ValueError[] = "ValueError";
+static char __pyx_k_suppressed[] = "suppressed";
+static char __pyx_k_nms_cpu_nms[] = "nms.cpu_nms";
+static char __pyx_k_RuntimeError[] = "RuntimeError";
+static char __pyx_k_pyx_getbuffer[] = "__pyx_getbuffer";
+static char __pyx_k_pyx_releasebuffer[] = "__pyx_releasebuffer";
+static char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous";
+static char __pyx_k_nfs_yoda_xinleic_Inf_Code_Faste[] = "/nfs.yoda/xinleic/Inf/Code/Faster-RCNN_TF/lib/nms/cpu_nms.pyx";
+static char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)";
+static char __pyx_k_Format_string_allocated_too_shor[] = "Format string allocated too short, see comment in numpy.pxd";
+static char __pyx_k_Non_native_byte_order_not_suppor[] = "Non-native byte order not supported";
+static char __pyx_k_ndarray_is_not_Fortran_contiguou[] = "ndarray is not Fortran contiguous";
+static char __pyx_k_Format_string_allocated_too_shor_2[] = "Format string allocated too short.";
+static PyObject *__pyx_kp_u_Format_string_allocated_too_shor;
+static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2;
+static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor;
+static PyObject *__pyx_n_s_RuntimeError;
+static PyObject *__pyx_n_s_ValueError;
+static PyObject *__pyx_n_s_areas;
+static PyObject *__pyx_n_s_argsort;
+static PyObject *__pyx_n_s_cpu_nms;
+static PyObject *__pyx_n_s_dets;
+static PyObject *__pyx_n_s_dtype;
+static PyObject *__pyx_n_s_h;
+static PyObject *__pyx_n_s_i;
+static PyObject *__pyx_n_s_i_2;
+static PyObject *__pyx_n_s_iarea;
+static PyObject *__pyx_n_s_import;
+static PyObject *__pyx_n_s_int;
+static PyObject *__pyx_n_s_inter;
+static PyObject *__pyx_n_s_ix1;
+static PyObject *__pyx_n_s_ix2;
+static PyObject *__pyx_n_s_iy1;
+static PyObject *__pyx_n_s_iy2;
+static PyObject *__pyx_n_s_j;
+static PyObject *__pyx_n_s_j_2;
+static PyObject *__pyx_n_s_keep;
+static PyObject *__pyx_n_s_main;
+static PyObject *__pyx_kp_u_ndarray_is_not_C_contiguous;
+static PyObject *__pyx_kp_u_ndarray_is_not_Fortran_contiguou;
+static PyObject *__pyx_n_s_ndets;
+static PyObject *__pyx_kp_s_nfs_yoda_xinleic_Inf_Code_Faste;
+static PyObject *__pyx_n_s_nms_cpu_nms;
+static PyObject *__pyx_n_s_np;
+static PyObject *__pyx_n_s_numpy;
+static PyObject *__pyx_n_s_order;
+static PyObject *__pyx_n_s_ovr;
+static PyObject *__pyx_n_s_pyx_getbuffer;
+static PyObject *__pyx_n_s_pyx_releasebuffer;
+static PyObject *__pyx_n_s_range;
+static PyObject *__pyx_n_s_scores;
+static PyObject *__pyx_n_s_suppressed;
+static PyObject *__pyx_n_s_test;
+static PyObject *__pyx_n_s_thresh;
+static PyObject *__pyx_kp_u_unknown_dtype_code_in_numpy_pxd;
+static PyObject *__pyx_n_s_w;
+static PyObject *__pyx_n_s_x1;
+static PyObject *__pyx_n_s_x2;
+static PyObject *__pyx_n_s_xx1;
+static PyObject *__pyx_n_s_xx2;
+static PyObject *__pyx_n_s_y1;
+static PyObject *__pyx_n_s_y2;
+static PyObject *__pyx_n_s_yy1;
+static PyObject *__pyx_n_s_yy2;
+static PyObject *__pyx_n_s_zeros;
+static PyObject *__pyx_int_0;
+static PyObject *__pyx_int_1;
+static PyObject *__pyx_int_2;
+static PyObject *__pyx_int_3;
+static PyObject *__pyx_int_4;
+static PyObject *__pyx_int_neg_1;
+static PyObject *__pyx_slice_;
+static PyObject *__pyx_slice__3;
+static PyObject *__pyx_slice__5;
+static PyObject *__pyx_slice__7;
+static PyObject *__pyx_slice__9;
+static PyObject *__pyx_tuple__2;
+static PyObject *__pyx_tuple__4;
+static PyObject *__pyx_tuple__6;
+static PyObject *__pyx_tuple__8;
+static PyObject *__pyx_slice__11;
+static PyObject *__pyx_tuple__10;
+static PyObject *__pyx_tuple__12;
+static PyObject *__pyx_tuple__13;
+static PyObject *__pyx_tuple__14;
+static PyObject *__pyx_tuple__15;
+static PyObject *__pyx_tuple__16;
+static PyObject *__pyx_tuple__17;
+static PyObject *__pyx_tuple__18;
+static PyObject *__pyx_codeobj__19;
+
+/* "nms/cpu_nms.pyx":11
+ * cimport numpy as np
+ * 
+ * cdef inline np.float32_t max(np.float32_t a, np.float32_t b):             # <<<<<<<<<<<<<<
+ *     return a if a >= b else b
+ * 
+ */
+
+static CYTHON_INLINE __pyx_t_5numpy_float32_t __pyx_f_3nms_7cpu_nms_max(__pyx_t_5numpy_float32_t __pyx_v_a, __pyx_t_5numpy_float32_t __pyx_v_b) {
+  /* Return the larger of two float32 values.
+   *
+   * Implements `a if a >= b else b` from the .pyx source: equal operands
+   * yield a, and whenever `a >= b` evaluates false, b is returned.
+   *
+   * The RefNanny macros bracketing the body expand to nothing unless
+   * CYTHON_REFNANNY is enabled.
+   */
+  __pyx_t_5numpy_float32_t __pyx_v_larger;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("max", 0);
+
+  /* "nms/cpu_nms.pyx":12
+ * 
+ * cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
+ *     return a if a >= b else b             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
+ */
+  __pyx_v_larger = (__pyx_v_a >= __pyx_v_b) ? __pyx_v_a : __pyx_v_b;
+
+  /* "nms/cpu_nms.pyx":11
+ * cimport numpy as np
+ * 
+ * cdef inline np.float32_t max(np.float32_t a, np.float32_t b):             # <<<<<<<<<<<<<<
+ *     return a if a >= b else b
+ * 
+ */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_v_larger;
+}
+
+/* "nms/cpu_nms.pyx":14
+ *     return a if a >= b else b
+ * 
+ * cdef inline np.float32_t min(np.float32_t a, np.float32_t b):             # <<<<<<<<<<<<<<
+ *     return a if a <= b else b
+ * 
+ */
+
+static CYTHON_INLINE __pyx_t_5numpy_float32_t __pyx_f_3nms_7cpu_nms_min(__pyx_t_5numpy_float32_t __pyx_v_a, __pyx_t_5numpy_float32_t __pyx_v_b) {
+  /* Return the smaller of two float32 values.
+   *
+   * Implements `a if a <= b else b` from the .pyx source: equal operands
+   * yield a, and whenever `a <= b` evaluates false, b is returned.
+   *
+   * The RefNanny macros bracketing the body expand to nothing unless
+   * CYTHON_REFNANNY is enabled.
+   */
+  __pyx_t_5numpy_float32_t __pyx_v_smaller;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("min", 0);
+
+  /* "nms/cpu_nms.pyx":15
+ * 
+ * cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
+ *     return a if a <= b else b             # <<<<<<<<<<<<<<
+ * 
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+ */
+  __pyx_v_smaller = (__pyx_v_a <= __pyx_v_b) ? __pyx_v_a : __pyx_v_b;
+
+  /* "nms/cpu_nms.pyx":14
+ *     return a if a >= b else b
+ * 
+ * cdef inline np.float32_t min(np.float32_t a, np.float32_t b):             # <<<<<<<<<<<<<<
+ *     return a if a <= b else b
+ * 
+ */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_v_smaller;
+}
+
+/* "nms/cpu_nms.pyx":17
+ *     return a if a <= b else b
+ * 
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ */
+
+/* Python wrapper: unpacks (dets, thresh) from positional/keyword arguments, type-checks both, then calls the implementation function. */
+static PyObject *__pyx_pw_3nms_7cpu_nms_1cpu_nms(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static PyMethodDef __pyx_mdef_3nms_7cpu_nms_1cpu_nms = {__Pyx_NAMESTR("cpu_nms"), (PyCFunction)__pyx_pw_3nms_7cpu_nms_1cpu_nms, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)};
+static PyObject *__pyx_pw_3nms_7cpu_nms_1cpu_nms(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyArrayObject *__pyx_v_dets = 0;
+  PyObject *__pyx_v_thresh = 0;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("cpu_nms (wrapper)", 0);
+  {
+    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dets,&__pyx_n_s_thresh,0};
+    PyObject* values[2] = {0,0};
+    if (unlikely(__pyx_kwds)) {  /* keyword arguments present: combine positional and keyword values */
+      Py_ssize_t kw_args;
+      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+      switch (pos_args) {
+        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);  /* fall through */
+        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);  /* fall through */
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = PyDict_Size(__pyx_kwds);
+      switch (pos_args) {  /* fetch whatever was not given positionally from the keyword dict */
+        case  0:
+        if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_dets)) != 0)) kw_args--;
+        else goto __pyx_L5_argtuple_error;
+        case  1:  /* also reached by fall-through from case 0 */
+        if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_thresh)) != 0)) kw_args--;
+        else {
+          __Pyx_RaiseArgtupleInvalid("cpu_nms", 1, 2, 2, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+        }
+      }
+      if (unlikely(kw_args > 0)) {  /* keywords remain after the known names were consumed */
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "cpu_nms") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+      }
+    } else if (PyTuple_GET_SIZE(__pyx_args) != 2) {  /* positional-only call: exactly two arguments are required */
+      goto __pyx_L5_argtuple_error;
+    } else {
+      values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+      values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+    }
+    __pyx_v_dets = ((PyArrayObject *)values[0]);
+    __pyx_v_thresh = ((PyObject*)values[1]);
+  }
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("cpu_nms", 1, 2, 2, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+  __pyx_L3_error:;  /* argument unpacking failed: add a traceback entry and return NULL */
+  __Pyx_AddTraceback("nms.cpu_nms.cpu_nms", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;  /* dets is checked against numpy.ndarray (exact=0), thresh against PyFloat_Type (exact=1); both pass none_allowed=1 */
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_dets), __pyx_ptype_5numpy_ndarray, 1, "dets", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_thresh), (&PyFloat_Type), 1, "thresh", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_r = __pyx_pf_3nms_7cpu_nms_cpu_nms(__pyx_self, __pyx_v_dets, __pyx_v_thresh);
+
+  /* function exit code */
+  goto __pyx_L0;
+  __pyx_L1_error:;  /* type check failed: return NULL; no traceback entry is added on this path */
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyObject *__pyx_pf_3nms_7cpu_nms_cpu_nms(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dets, PyObject *__pyx_v_thresh) {
+  PyArrayObject *__pyx_v_x1 = 0;
+  PyArrayObject *__pyx_v_y1 = 0;
+  PyArrayObject *__pyx_v_x2 = 0;
+  PyArrayObject *__pyx_v_y2 = 0;
+  PyArrayObject *__pyx_v_scores = 0;
+  PyArrayObject *__pyx_v_areas = 0;
+  PyArrayObject *__pyx_v_order = 0;
+  int __pyx_v_ndets;
+  PyArrayObject *__pyx_v_suppressed = 0;
+  int __pyx_v__i;
+  int __pyx_v__j;
+  int __pyx_v_i;
+  int __pyx_v_j;
+  __pyx_t_5numpy_float32_t __pyx_v_ix1;
+  __pyx_t_5numpy_float32_t __pyx_v_iy1;
+  __pyx_t_5numpy_float32_t __pyx_v_ix2;
+  __pyx_t_5numpy_float32_t __pyx_v_iy2;
+  __pyx_t_5numpy_float32_t __pyx_v_iarea;
+  __pyx_t_5numpy_float32_t __pyx_v_xx1;
+  __pyx_t_5numpy_float32_t __pyx_v_yy1;
+  __pyx_t_5numpy_float32_t __pyx_v_xx2;
+  __pyx_t_5numpy_float32_t __pyx_v_yy2;
+  __pyx_t_5numpy_float32_t __pyx_v_w;
+  __pyx_t_5numpy_float32_t __pyx_v_h;
+  __pyx_t_5numpy_float32_t __pyx_v_inter;
+  __pyx_t_5numpy_float32_t __pyx_v_ovr;
+  PyObject *__pyx_v_keep = NULL;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_areas;
+  __Pyx_Buffer __pyx_pybuffer_areas;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_dets;
+  __Pyx_Buffer __pyx_pybuffer_dets;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_order;
+  __Pyx_Buffer __pyx_pybuffer_order;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_scores;
+  __Pyx_Buffer __pyx_pybuffer_scores;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_suppressed;
+  __Pyx_Buffer __pyx_pybuffer_suppressed;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_x1;
+  __Pyx_Buffer __pyx_pybuffer_x1;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_x2;
+  __Pyx_Buffer __pyx_pybuffer_x2;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_y1;
+  __Pyx_Buffer __pyx_pybuffer_y1;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_y2;
+  __Pyx_Buffer __pyx_pybuffer_y2;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyArrayObject *__pyx_t_2 = NULL;
+  PyArrayObject *__pyx_t_3 = NULL;
+  PyArrayObject *__pyx_t_4 = NULL;
+  PyArrayObject *__pyx_t_5 = NULL;
+  PyArrayObject *__pyx_t_6 = NULL;
+  PyObject *__pyx_t_7 = NULL;
+  PyObject *__pyx_t_8 = NULL;
+  PyArrayObject *__pyx_t_9 = NULL;
+  PyArrayObject *__pyx_t_10 = NULL;
+  PyObject *__pyx_t_11 = NULL;
+  PyObject *__pyx_t_12 = NULL;
+  PyArrayObject *__pyx_t_13 = NULL;
+  int __pyx_t_14;
+  int __pyx_t_15;
+  int __pyx_t_16;
+  int __pyx_t_17;
+  int __pyx_t_18;
+  int __pyx_t_19;
+  int __pyx_t_20;
+  int __pyx_t_21;
+  int __pyx_t_22;
+  int __pyx_t_23;
+  int __pyx_t_24;
+  int __pyx_t_25;
+  int __pyx_t_26;
+  int __pyx_t_27;
+  int __pyx_t_28;
+  int __pyx_t_29;
+  int __pyx_t_30;
+  int __pyx_t_31;
+  int __pyx_t_32;
+  int __pyx_t_33;
+  int __pyx_t_34;
+  __pyx_t_5numpy_float32_t __pyx_t_35;
+  int __pyx_t_36;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("cpu_nms", 0);
+  __pyx_pybuffer_x1.pybuffer.buf = NULL;
+  __pyx_pybuffer_x1.refcount = 0;
+  __pyx_pybuffernd_x1.data = NULL;
+  __pyx_pybuffernd_x1.rcbuffer = &__pyx_pybuffer_x1;
+  __pyx_pybuffer_y1.pybuffer.buf = NULL;
+  __pyx_pybuffer_y1.refcount = 0;
+  __pyx_pybuffernd_y1.data = NULL;
+  __pyx_pybuffernd_y1.rcbuffer = &__pyx_pybuffer_y1;
+  __pyx_pybuffer_x2.pybuffer.buf = NULL;
+  __pyx_pybuffer_x2.refcount = 0;
+  __pyx_pybuffernd_x2.data = NULL;
+  __pyx_pybuffernd_x2.rcbuffer = &__pyx_pybuffer_x2;
+  __pyx_pybuffer_y2.pybuffer.buf = NULL;
+  __pyx_pybuffer_y2.refcount = 0;
+  __pyx_pybuffernd_y2.data = NULL;
+  __pyx_pybuffernd_y2.rcbuffer = &__pyx_pybuffer_y2;
+  __pyx_pybuffer_scores.pybuffer.buf = NULL;
+  __pyx_pybuffer_scores.refcount = 0;
+  __pyx_pybuffernd_scores.data = NULL;
+  __pyx_pybuffernd_scores.rcbuffer = &__pyx_pybuffer_scores;
+  __pyx_pybuffer_areas.pybuffer.buf = NULL;
+  __pyx_pybuffer_areas.refcount = 0;
+  __pyx_pybuffernd_areas.data = NULL;
+  __pyx_pybuffernd_areas.rcbuffer = &__pyx_pybuffer_areas;
+  __pyx_pybuffer_order.pybuffer.buf = NULL;
+  __pyx_pybuffer_order.refcount = 0;
+  __pyx_pybuffernd_order.data = NULL;
+  __pyx_pybuffernd_order.rcbuffer = &__pyx_pybuffer_order;
+  __pyx_pybuffer_suppressed.pybuffer.buf = NULL;
+  __pyx_pybuffer_suppressed.refcount = 0;
+  __pyx_pybuffernd_suppressed.data = NULL;
+  __pyx_pybuffernd_suppressed.rcbuffer = &__pyx_pybuffer_suppressed;
+  __pyx_pybuffer_dets.pybuffer.buf = NULL;
+  __pyx_pybuffer_dets.refcount = 0;
+  __pyx_pybuffernd_dets.data = NULL;
+  __pyx_pybuffernd_dets.rcbuffer = &__pyx_pybuffer_dets;
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_dets.rcbuffer->pybuffer, (PyObject*)__pyx_v_dets, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+  __pyx_pybuffernd_dets.diminfo[0].strides = __pyx_pybuffernd_dets.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_dets.diminfo[0].shape = __pyx_pybuffernd_dets.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_dets.diminfo[1].strides = __pyx_pybuffernd_dets.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_dets.diminfo[1].shape = __pyx_pybuffernd_dets.rcbuffer->pybuffer.shape[1];
+
+  /* "nms/cpu_nms.pyx":18
+ * 
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ */
+  __pyx_t_1 = PyObject_GetItem(((PyObject *)__pyx_v_dets), __pyx_tuple__2); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_2 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x1.rcbuffer->pybuffer, (PyObject*)__pyx_t_2, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_x1 = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_x1.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_x1.diminfo[0].strides = __pyx_pybuffernd_x1.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_x1.diminfo[0].shape = __pyx_pybuffernd_x1.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_2 = 0;
+  __pyx_v_x1 = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":19
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+ */
+  __pyx_t_1 = PyObject_GetItem(((PyObject *)__pyx_v_dets), __pyx_tuple__4); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_3 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y1.rcbuffer->pybuffer, (PyObject*)__pyx_t_3, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_y1 = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_y1.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_y1.diminfo[0].strides = __pyx_pybuffernd_y1.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_y1.diminfo[0].shape = __pyx_pybuffernd_y1.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_3 = 0;
+  __pyx_v_y1 = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":20
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+ *     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+ */
+  __pyx_t_1 = PyObject_GetItem(((PyObject *)__pyx_v_dets), __pyx_tuple__6); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_4 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_x2.rcbuffer->pybuffer, (PyObject*)__pyx_t_4, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_x2 = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_x2.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_x2.diminfo[0].strides = __pyx_pybuffernd_x2.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_x2.diminfo[0].shape = __pyx_pybuffernd_x2.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_4 = 0;
+  __pyx_v_x2 = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":21
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+ * 
+ */
+  __pyx_t_1 = PyObject_GetItem(((PyObject *)__pyx_v_dets), __pyx_tuple__8); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_5 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_y2.rcbuffer->pybuffer, (PyObject*)__pyx_t_5, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_y2 = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_y2.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_y2.diminfo[0].strides = __pyx_pybuffernd_y2.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_y2.diminfo[0].shape = __pyx_pybuffernd_y2.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_5 = 0;
+  __pyx_v_y2 = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":22
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+ *     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]             # <<<<<<<<<<<<<<
+ * 
+ *     cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ */
+  __pyx_t_1 = PyObject_GetItem(((PyObject *)__pyx_v_dets), __pyx_tuple__10); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_6 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_scores.rcbuffer->pybuffer, (PyObject*)__pyx_t_6, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_scores = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_scores.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_scores.diminfo[0].strides = __pyx_pybuffernd_scores.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_scores.diminfo[0].shape = __pyx_pybuffernd_scores.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_6 = 0;
+  __pyx_v_scores = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":24
+ *     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+ * 
+ *     cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
+ * 
+ */
+  __pyx_t_1 = PyNumber_Subtract(((PyObject *)__pyx_v_x2), ((PyObject *)__pyx_v_x1)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_7 = PyNumber_Add(__pyx_t_1, __pyx_int_1); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_7);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = PyNumber_Subtract(((PyObject *)__pyx_v_y2), ((PyObject *)__pyx_v_y1)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_8 = PyNumber_Add(__pyx_t_1, __pyx_int_1); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_8);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = PyNumber_Multiply(__pyx_t_7, __pyx_t_8); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_9 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_areas.rcbuffer->pybuffer, (PyObject*)__pyx_t_9, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_areas = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_areas.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_areas.diminfo[0].strides = __pyx_pybuffernd_areas.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_areas.diminfo[0].shape = __pyx_pybuffernd_areas.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_9 = 0;
+  __pyx_v_areas = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":25
+ * 
+ *     cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ *     cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]             # <<<<<<<<<<<<<<
+ * 
+ *     cdef int ndets = dets.shape[0]
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_scores), __pyx_n_s_argsort); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_8 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_8);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = PyObject_GetItem(__pyx_t_8, __pyx_slice__11); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_10 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_order.rcbuffer->pybuffer, (PyObject*)__pyx_t_10, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_order = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_order.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_order.diminfo[0].strides = __pyx_pybuffernd_order.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_order.diminfo[0].shape = __pyx_pybuffernd_order.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_10 = 0;
+  __pyx_v_order = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":27
+ *     cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
+ * 
+ *     cdef int ndets = dets.shape[0]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.int_t, ndim=1] suppressed = \
+ *             np.zeros((ndets), dtype=np.int)
+ */
+  __pyx_v_ndets = (__pyx_v_dets->dimensions[0]);
+
+  /* "nms/cpu_nms.pyx":29
+ *     cdef int ndets = dets.shape[0]
+ *     cdef np.ndarray[np.int_t, ndim=1] suppressed = \
+ *             np.zeros((ndets), dtype=np.int)             # <<<<<<<<<<<<<<
+ * 
+ *     # nominal indices
+ */
+  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_8 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_zeros); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_8);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_ndets); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_7 = PyTuple_New(1); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_7);
+  PyTuple_SET_ITEM(__pyx_t_7, 0, __pyx_t_1);
+  __Pyx_GIVEREF(__pyx_t_1);
+  __pyx_t_1 = 0;
+  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_11 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_11)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_11);
+  __pyx_t_12 = __Pyx_PyObject_GetAttrStr(__pyx_t_11, __pyx_n_s_int); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_12);
+  __Pyx_DECREF(__pyx_t_11); __pyx_t_11 = 0;
+  if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_12) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+  __pyx_t_12 = __Pyx_PyObject_Call(__pyx_t_8, __pyx_t_7, __pyx_t_1); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_12);
+  __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+  __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (!(likely(((__pyx_t_12) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_12, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_13 = ((PyArrayObject *)__pyx_t_12);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_suppressed.rcbuffer->pybuffer, (PyObject*)__pyx_t_13, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES| PyBUF_WRITABLE, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_suppressed = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_suppressed.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_suppressed.diminfo[0].strides = __pyx_pybuffernd_suppressed.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_suppressed.diminfo[0].shape = __pyx_pybuffernd_suppressed.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_13 = 0;
+  __pyx_v_suppressed = ((PyArrayObject *)__pyx_t_12);
+  __pyx_t_12 = 0;
+
+  /* "nms/cpu_nms.pyx":42
+ *     cdef np.float32_t inter, ovr
+ * 
+ *     keep = []             # <<<<<<<<<<<<<<
+ *     for _i in range(ndets):
+ *         i = order[_i]
+ */
+  __pyx_t_12 = PyList_New(0); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_12);
+  __pyx_v_keep = ((PyObject*)__pyx_t_12);
+  __pyx_t_12 = 0;
+
+  /* "nms/cpu_nms.pyx":43
+ * 
+ *     keep = []
+ *     for _i in range(ndets):             # <<<<<<<<<<<<<<
+ *         i = order[_i]
+ *         if suppressed[i] == 1:
+ */
+  __pyx_t_14 = __pyx_v_ndets;
+  for (__pyx_t_15 = 0; __pyx_t_15 < __pyx_t_14; __pyx_t_15+=1) {
+    __pyx_v__i = __pyx_t_15;
+
+    /* "nms/cpu_nms.pyx":44
+ *     keep = []
+ *     for _i in range(ndets):
+ *         i = order[_i]             # <<<<<<<<<<<<<<
+ *         if suppressed[i] == 1:
+ *             continue
+ */
+    __pyx_t_16 = __pyx_v__i;
+    __pyx_t_17 = -1;
+    if (__pyx_t_16 < 0) {
+      __pyx_t_16 += __pyx_pybuffernd_order.diminfo[0].shape;
+      if (unlikely(__pyx_t_16 < 0)) __pyx_t_17 = 0;
+    } else if (unlikely(__pyx_t_16 >= __pyx_pybuffernd_order.diminfo[0].shape)) __pyx_t_17 = 0;
+    if (unlikely(__pyx_t_17 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_17);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 44; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    __pyx_v_i = (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_int_t *, __pyx_pybuffernd_order.rcbuffer->pybuffer.buf, __pyx_t_16, __pyx_pybuffernd_order.diminfo[0].strides));
+
+    /* "nms/cpu_nms.pyx":45
+ *     for _i in range(ndets):
+ *         i = order[_i]
+ *         if suppressed[i] == 1:             # <<<<<<<<<<<<<<
+ *             continue
+ *         keep.append(i)
+ */
+    __pyx_t_17 = __pyx_v_i;
+    __pyx_t_18 = -1;
+    if (__pyx_t_17 < 0) {
+      __pyx_t_17 += __pyx_pybuffernd_suppressed.diminfo[0].shape;
+      if (unlikely(__pyx_t_17 < 0)) __pyx_t_18 = 0;
+    } else if (unlikely(__pyx_t_17 >= __pyx_pybuffernd_suppressed.diminfo[0].shape)) __pyx_t_18 = 0;
+    if (unlikely(__pyx_t_18 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_18);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 45; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    __pyx_t_19 = (((*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_int_t *, __pyx_pybuffernd_suppressed.rcbuffer->pybuffer.buf, __pyx_t_17, __pyx_pybuffernd_suppressed.diminfo[0].strides)) == 1) != 0);
+    if (__pyx_t_19) {
+
+      /* "nms/cpu_nms.pyx":46
+ *         i = order[_i]
+ *         if suppressed[i] == 1:
+ *             continue             # <<<<<<<<<<<<<<
+ *         keep.append(i)
+ *         ix1 = x1[i]
+ */
+      goto __pyx_L3_continue;
+    }
+
+    /* "nms/cpu_nms.pyx":47
+ *         if suppressed[i] == 1:
+ *             continue
+ *         keep.append(i)             # <<<<<<<<<<<<<<
+ *         ix1 = x1[i]
+ *         iy1 = y1[i]
+ */
+    __pyx_t_12 = __Pyx_PyInt_From_int(__pyx_v_i); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_12);
+    __pyx_t_20 = __Pyx_PyList_Append(__pyx_v_keep, __pyx_t_12); if (unlikely(__pyx_t_20 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 47; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+
+    /* "nms/cpu_nms.pyx":48
+ *             continue
+ *         keep.append(i)
+ *         ix1 = x1[i]             # <<<<<<<<<<<<<<
+ *         iy1 = y1[i]
+ *         ix2 = x2[i]
+ */
+    __pyx_t_18 = __pyx_v_i;
+    __pyx_t_21 = -1;
+    if (__pyx_t_18 < 0) {
+      __pyx_t_18 += __pyx_pybuffernd_x1.diminfo[0].shape;
+      if (unlikely(__pyx_t_18 < 0)) __pyx_t_21 = 0;
+    } else if (unlikely(__pyx_t_18 >= __pyx_pybuffernd_x1.diminfo[0].shape)) __pyx_t_21 = 0;
+    if (unlikely(__pyx_t_21 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_21);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 48; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    __pyx_v_ix1 = (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_x1.rcbuffer->pybuffer.buf, __pyx_t_18, __pyx_pybuffernd_x1.diminfo[0].strides));
+
+    /* "nms/cpu_nms.pyx":49
+ *         keep.append(i)
+ *         ix1 = x1[i]
+ *         iy1 = y1[i]             # <<<<<<<<<<<<<<
+ *         ix2 = x2[i]
+ *         iy2 = y2[i]
+ */
+    __pyx_t_21 = __pyx_v_i;
+    __pyx_t_22 = -1;
+    if (__pyx_t_21 < 0) {
+      __pyx_t_21 += __pyx_pybuffernd_y1.diminfo[0].shape;
+      if (unlikely(__pyx_t_21 < 0)) __pyx_t_22 = 0;
+    } else if (unlikely(__pyx_t_21 >= __pyx_pybuffernd_y1.diminfo[0].shape)) __pyx_t_22 = 0;
+    if (unlikely(__pyx_t_22 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_22);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    __pyx_v_iy1 = (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_y1.rcbuffer->pybuffer.buf, __pyx_t_21, __pyx_pybuffernd_y1.diminfo[0].strides));
+
+    /* "nms/cpu_nms.pyx":50
+ *         ix1 = x1[i]
+ *         iy1 = y1[i]
+ *         ix2 = x2[i]             # <<<<<<<<<<<<<<
+ *         iy2 = y2[i]
+ *         iarea = areas[i]
+ */
+    __pyx_t_22 = __pyx_v_i;
+    __pyx_t_23 = -1;
+    if (__pyx_t_22 < 0) {
+      __pyx_t_22 += __pyx_pybuffernd_x2.diminfo[0].shape;
+      if (unlikely(__pyx_t_22 < 0)) __pyx_t_23 = 0;
+    } else if (unlikely(__pyx_t_22 >= __pyx_pybuffernd_x2.diminfo[0].shape)) __pyx_t_23 = 0;
+    if (unlikely(__pyx_t_23 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_23);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    __pyx_v_ix2 = (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_x2.rcbuffer->pybuffer.buf, __pyx_t_22, __pyx_pybuffernd_x2.diminfo[0].strides));
+
+    /* "nms/cpu_nms.pyx":51
+ *         iy1 = y1[i]
+ *         ix2 = x2[i]
+ *         iy2 = y2[i]             # <<<<<<<<<<<<<<
+ *         iarea = areas[i]
+ *         for _j in range(_i + 1, ndets):
+ */
+    __pyx_t_23 = __pyx_v_i;
+    __pyx_t_24 = -1;
+    if (__pyx_t_23 < 0) {
+      __pyx_t_23 += __pyx_pybuffernd_y2.diminfo[0].shape;
+      if (unlikely(__pyx_t_23 < 0)) __pyx_t_24 = 0;
+    } else if (unlikely(__pyx_t_23 >= __pyx_pybuffernd_y2.diminfo[0].shape)) __pyx_t_24 = 0;
+    if (unlikely(__pyx_t_24 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_24);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    __pyx_v_iy2 = (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_y2.rcbuffer->pybuffer.buf, __pyx_t_23, __pyx_pybuffernd_y2.diminfo[0].strides));
+
+    /* "nms/cpu_nms.pyx":52
+ *         ix2 = x2[i]
+ *         iy2 = y2[i]
+ *         iarea = areas[i]             # <<<<<<<<<<<<<<
+ *         for _j in range(_i + 1, ndets):
+ *             j = order[_j]
+ */
+    __pyx_t_24 = __pyx_v_i;
+    __pyx_t_25 = -1;
+    if (__pyx_t_24 < 0) {
+      __pyx_t_24 += __pyx_pybuffernd_areas.diminfo[0].shape;
+      if (unlikely(__pyx_t_24 < 0)) __pyx_t_25 = 0;
+    } else if (unlikely(__pyx_t_24 >= __pyx_pybuffernd_areas.diminfo[0].shape)) __pyx_t_25 = 0;
+    if (unlikely(__pyx_t_25 != -1)) {
+      __Pyx_RaiseBufferIndexError(__pyx_t_25);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    __pyx_v_iarea = (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_areas.rcbuffer->pybuffer.buf, __pyx_t_24, __pyx_pybuffernd_areas.diminfo[0].strides));
+
+    /* "nms/cpu_nms.pyx":53
+ *         iy2 = y2[i]
+ *         iarea = areas[i]
+ *         for _j in range(_i + 1, ndets):             # <<<<<<<<<<<<<<
+ *             j = order[_j]
+ *             if suppressed[j] == 1:
+ */
+    __pyx_t_25 = __pyx_v_ndets;
+    for (__pyx_t_26 = (__pyx_v__i + 1); __pyx_t_26 < __pyx_t_25; __pyx_t_26+=1) {
+      __pyx_v__j = __pyx_t_26;
+
+      /* "nms/cpu_nms.pyx":54
+ *         iarea = areas[i]
+ *         for _j in range(_i + 1, ndets):
+ *             j = order[_j]             # <<<<<<<<<<<<<<
+ *             if suppressed[j] == 1:
+ *                 continue
+ */
+      __pyx_t_27 = __pyx_v__j;
+      __pyx_t_28 = -1;
+      if (__pyx_t_27 < 0) {
+        __pyx_t_27 += __pyx_pybuffernd_order.diminfo[0].shape;
+        if (unlikely(__pyx_t_27 < 0)) __pyx_t_28 = 0;
+      } else if (unlikely(__pyx_t_27 >= __pyx_pybuffernd_order.diminfo[0].shape)) __pyx_t_28 = 0;
+      if (unlikely(__pyx_t_28 != -1)) {
+        __Pyx_RaiseBufferIndexError(__pyx_t_28);
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 54; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_v_j = (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_int_t *, __pyx_pybuffernd_order.rcbuffer->pybuffer.buf, __pyx_t_27, __pyx_pybuffernd_order.diminfo[0].strides));
+
+      /* "nms/cpu_nms.pyx":55
+ *         for _j in range(_i + 1, ndets):
+ *             j = order[_j]
+ *             if suppressed[j] == 1:             # <<<<<<<<<<<<<<
+ *                 continue
+ *             xx1 = max(ix1, x1[j])
+ */
+      __pyx_t_28 = __pyx_v_j;
+      __pyx_t_29 = -1;
+      if (__pyx_t_28 < 0) {
+        __pyx_t_28 += __pyx_pybuffernd_suppressed.diminfo[0].shape;
+        if (unlikely(__pyx_t_28 < 0)) __pyx_t_29 = 0;
+      } else if (unlikely(__pyx_t_28 >= __pyx_pybuffernd_suppressed.diminfo[0].shape)) __pyx_t_29 = 0;
+      if (unlikely(__pyx_t_29 != -1)) {
+        __Pyx_RaiseBufferIndexError(__pyx_t_29);
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 55; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_t_19 = (((*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_int_t *, __pyx_pybuffernd_suppressed.rcbuffer->pybuffer.buf, __pyx_t_28, __pyx_pybuffernd_suppressed.diminfo[0].strides)) == 1) != 0);
+      if (__pyx_t_19) {
+
+        /* "nms/cpu_nms.pyx":56
+ *             j = order[_j]
+ *             if suppressed[j] == 1:
+ *                 continue             # <<<<<<<<<<<<<<
+ *             xx1 = max(ix1, x1[j])
+ *             yy1 = max(iy1, y1[j])
+ */
+        goto __pyx_L6_continue;
+      }
+
+      /* "nms/cpu_nms.pyx":57
+ *             if suppressed[j] == 1:
+ *                 continue
+ *             xx1 = max(ix1, x1[j])             # <<<<<<<<<<<<<<
+ *             yy1 = max(iy1, y1[j])
+ *             xx2 = min(ix2, x2[j])
+ */
+      __pyx_t_29 = __pyx_v_j;
+      __pyx_t_30 = -1;
+      if (__pyx_t_29 < 0) {
+        __pyx_t_29 += __pyx_pybuffernd_x1.diminfo[0].shape;
+        if (unlikely(__pyx_t_29 < 0)) __pyx_t_30 = 0;
+      } else if (unlikely(__pyx_t_29 >= __pyx_pybuffernd_x1.diminfo[0].shape)) __pyx_t_30 = 0;
+      if (unlikely(__pyx_t_30 != -1)) {
+        __Pyx_RaiseBufferIndexError(__pyx_t_30);
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 57; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_v_xx1 = __pyx_f_3nms_7cpu_nms_max(__pyx_v_ix1, (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_x1.rcbuffer->pybuffer.buf, __pyx_t_29, __pyx_pybuffernd_x1.diminfo[0].strides)));
+
+      /* "nms/cpu_nms.pyx":58
+ *                 continue
+ *             xx1 = max(ix1, x1[j])
+ *             yy1 = max(iy1, y1[j])             # <<<<<<<<<<<<<<
+ *             xx2 = min(ix2, x2[j])
+ *             yy2 = min(iy2, y2[j])
+ */
+      __pyx_t_30 = __pyx_v_j;
+      __pyx_t_31 = -1;
+      if (__pyx_t_30 < 0) {
+        __pyx_t_30 += __pyx_pybuffernd_y1.diminfo[0].shape;
+        if (unlikely(__pyx_t_30 < 0)) __pyx_t_31 = 0;
+      } else if (unlikely(__pyx_t_30 >= __pyx_pybuffernd_y1.diminfo[0].shape)) __pyx_t_31 = 0;
+      if (unlikely(__pyx_t_31 != -1)) {
+        __Pyx_RaiseBufferIndexError(__pyx_t_31);
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 58; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_v_yy1 = __pyx_f_3nms_7cpu_nms_max(__pyx_v_iy1, (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_y1.rcbuffer->pybuffer.buf, __pyx_t_30, __pyx_pybuffernd_y1.diminfo[0].strides)));
+
+      /* "nms/cpu_nms.pyx":59
+ *             xx1 = max(ix1, x1[j])
+ *             yy1 = max(iy1, y1[j])
+ *             xx2 = min(ix2, x2[j])             # <<<<<<<<<<<<<<
+ *             yy2 = min(iy2, y2[j])
+ *             w = max(0.0, xx2 - xx1 + 1)
+ */
+      __pyx_t_31 = __pyx_v_j;
+      __pyx_t_32 = -1;
+      if (__pyx_t_31 < 0) {
+        __pyx_t_31 += __pyx_pybuffernd_x2.diminfo[0].shape;
+        if (unlikely(__pyx_t_31 < 0)) __pyx_t_32 = 0;
+      } else if (unlikely(__pyx_t_31 >= __pyx_pybuffernd_x2.diminfo[0].shape)) __pyx_t_32 = 0;
+      if (unlikely(__pyx_t_32 != -1)) {
+        __Pyx_RaiseBufferIndexError(__pyx_t_32);
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 59; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_v_xx2 = __pyx_f_3nms_7cpu_nms_min(__pyx_v_ix2, (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_x2.rcbuffer->pybuffer.buf, __pyx_t_31, __pyx_pybuffernd_x2.diminfo[0].strides)));
+
+      /* "nms/cpu_nms.pyx":60
+ *             yy1 = max(iy1, y1[j])
+ *             xx2 = min(ix2, x2[j])
+ *             yy2 = min(iy2, y2[j])             # <<<<<<<<<<<<<<
+ *             w = max(0.0, xx2 - xx1 + 1)
+ *             h = max(0.0, yy2 - yy1 + 1)
+ */
+      __pyx_t_32 = __pyx_v_j;
+      __pyx_t_33 = -1;
+      if (__pyx_t_32 < 0) {
+        __pyx_t_32 += __pyx_pybuffernd_y2.diminfo[0].shape;
+        if (unlikely(__pyx_t_32 < 0)) __pyx_t_33 = 0;
+      } else if (unlikely(__pyx_t_32 >= __pyx_pybuffernd_y2.diminfo[0].shape)) __pyx_t_33 = 0;
+      if (unlikely(__pyx_t_33 != -1)) {
+        __Pyx_RaiseBufferIndexError(__pyx_t_33);
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_v_yy2 = __pyx_f_3nms_7cpu_nms_min(__pyx_v_iy2, (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_y2.rcbuffer->pybuffer.buf, __pyx_t_32, __pyx_pybuffernd_y2.diminfo[0].strides)));
+
+      /* "nms/cpu_nms.pyx":61
+ *             xx2 = min(ix2, x2[j])
+ *             yy2 = min(iy2, y2[j])
+ *             w = max(0.0, xx2 - xx1 + 1)             # <<<<<<<<<<<<<<
+ *             h = max(0.0, yy2 - yy1 + 1)
+ *             inter = w * h
+ */
+      __pyx_v_w = __pyx_f_3nms_7cpu_nms_max(0.0, ((__pyx_v_xx2 - __pyx_v_xx1) + 1.0));
+
+      /* "nms/cpu_nms.pyx":62
+ *             yy2 = min(iy2, y2[j])
+ *             w = max(0.0, xx2 - xx1 + 1)
+ *             h = max(0.0, yy2 - yy1 + 1)             # <<<<<<<<<<<<<<
+ *             inter = w * h
+ *             ovr = inter / (iarea + areas[j] - inter)
+ */
+      __pyx_v_h = __pyx_f_3nms_7cpu_nms_max(0.0, ((__pyx_v_yy2 - __pyx_v_yy1) + 1.0));
+
+      /* "nms/cpu_nms.pyx":63
+ *             w = max(0.0, xx2 - xx1 + 1)
+ *             h = max(0.0, yy2 - yy1 + 1)
+ *             inter = w * h             # <<<<<<<<<<<<<<
+ *             ovr = inter / (iarea + areas[j] - inter)
+ *             if ovr >= thresh:
+ */
+      __pyx_v_inter = (__pyx_v_w * __pyx_v_h);
+
+      /* "nms/cpu_nms.pyx":64
+ *             h = max(0.0, yy2 - yy1 + 1)
+ *             inter = w * h
+ *             ovr = inter / (iarea + areas[j] - inter)             # <<<<<<<<<<<<<<
+ *             if ovr >= thresh:
+ *                 suppressed[j] = 1
+ */
+      __pyx_t_33 = __pyx_v_j;
+      __pyx_t_34 = -1;
+      if (__pyx_t_33 < 0) {
+        __pyx_t_33 += __pyx_pybuffernd_areas.diminfo[0].shape;
+        if (unlikely(__pyx_t_33 < 0)) __pyx_t_34 = 0;
+      } else if (unlikely(__pyx_t_33 >= __pyx_pybuffernd_areas.diminfo[0].shape)) __pyx_t_34 = 0;
+      if (unlikely(__pyx_t_34 != -1)) {
+        __Pyx_RaiseBufferIndexError(__pyx_t_34);
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_t_35 = ((__pyx_v_iarea + (*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_areas.rcbuffer->pybuffer.buf, __pyx_t_33, __pyx_pybuffernd_areas.diminfo[0].strides))) - __pyx_v_inter);
+      if (unlikely(__pyx_t_35 == 0)) {
+        #ifdef WITH_THREAD
+        PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure();
+        #endif
+        PyErr_SetString(PyExc_ZeroDivisionError, "float division");
+        #ifdef WITH_THREAD
+        PyGILState_Release(__pyx_gilstate_save);
+        #endif
+        {__pyx_filename = __pyx_f[0]; __pyx_lineno = 64; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_v_ovr = (__pyx_v_inter / __pyx_t_35);
+
+      /* "nms/cpu_nms.pyx":65
+ *             inter = w * h
+ *             ovr = inter / (iarea + areas[j] - inter)
+ *             if ovr >= thresh:             # <<<<<<<<<<<<<<
+ *                 suppressed[j] = 1
+ * 
+ */
+      __pyx_t_12 = PyFloat_FromDouble(__pyx_v_ovr); if (unlikely(!__pyx_t_12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 65; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_12);
+      __pyx_t_1 = PyObject_RichCompare(__pyx_t_12, __pyx_v_thresh, Py_GE); __Pyx_XGOTREF(__pyx_t_1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 65; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_12); __pyx_t_12 = 0;
+      __pyx_t_19 = __Pyx_PyObject_IsTrue(__pyx_t_1); if (unlikely(__pyx_t_19 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 65; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+      if (__pyx_t_19) {
+
+        /* "nms/cpu_nms.pyx":66
+ *             ovr = inter / (iarea + areas[j] - inter)
+ *             if ovr >= thresh:
+ *                 suppressed[j] = 1             # <<<<<<<<<<<<<<
+ * 
+ *     return keep
+ */
+        __pyx_t_34 = __pyx_v_j;
+        __pyx_t_36 = -1;
+        if (__pyx_t_34 < 0) {
+          __pyx_t_34 += __pyx_pybuffernd_suppressed.diminfo[0].shape;
+          if (unlikely(__pyx_t_34 < 0)) __pyx_t_36 = 0;
+        } else if (unlikely(__pyx_t_34 >= __pyx_pybuffernd_suppressed.diminfo[0].shape)) __pyx_t_36 = 0;
+        if (unlikely(__pyx_t_36 != -1)) {
+          __Pyx_RaiseBufferIndexError(__pyx_t_36);
+          {__pyx_filename = __pyx_f[0]; __pyx_lineno = 66; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        }
+        *__Pyx_BufPtrStrided1d(__pyx_t_5numpy_int_t *, __pyx_pybuffernd_suppressed.rcbuffer->pybuffer.buf, __pyx_t_34, __pyx_pybuffernd_suppressed.diminfo[0].strides) = 1;
+        goto __pyx_L9;
+      }
+      __pyx_L9:;
+      __pyx_L6_continue:;
+    }
+    __pyx_L3_continue:;
+  }
+
+  /* "nms/cpu_nms.pyx":68
+ *                 suppressed[j] = 1
+ * 
+ *     return keep             # <<<<<<<<<<<<<<
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __Pyx_INCREF(__pyx_v_keep);
+  __pyx_r = __pyx_v_keep;
+  goto __pyx_L0;
+
+  /* "nms/cpu_nms.pyx":17
+ *     return a if a <= b else b
+ * 
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_7);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_XDECREF(__pyx_t_11);
+  __Pyx_XDECREF(__pyx_t_12);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb;
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_areas.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dets.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_order.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_scores.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_suppressed.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x1.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x2.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y1.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y2.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("nms.cpu_nms.cpu_nms", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  goto __pyx_L2;
+  __pyx_L0:;
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_areas.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dets.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_order.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_scores.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_suppressed.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x1.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_x2.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y1.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_y2.rcbuffer->pybuffer);
+  __pyx_L2:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_x1);
+  __Pyx_XDECREF((PyObject *)__pyx_v_y1);
+  __Pyx_XDECREF((PyObject *)__pyx_v_x2);
+  __Pyx_XDECREF((PyObject *)__pyx_v_y2);
+  __Pyx_XDECREF((PyObject *)__pyx_v_scores);
+  __Pyx_XDECREF((PyObject *)__pyx_v_areas);
+  __Pyx_XDECREF((PyObject *)__pyx_v_order);
+  __Pyx_XDECREF((PyObject *)__pyx_v_suppressed);
+  __Pyx_XDECREF(__pyx_v_keep);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":194
+ *         # experimental exception made for __getbuffer__ and __releasebuffer__
+ *         # -- the details of this may change.
+ *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
+ *             # This implementation of getbuffer is geared towards Cython
+ *             # requirements, and does not yet fullfill the PEP.
+ */
+
+/* Python wrapper */
+static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto: forward declaration of the wrapper defined on the next line */
+static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) { /* thin entry point: casts its arguments and delegates to __pyx_pf_5numpy_7ndarray___getbuffer__ */
+  int __pyx_r; /* status produced by the implementation function; returned to the caller unchanged */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+  __pyx_r = __pyx_pf_5numpy_7ndarray___getbuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags)); /* self is reinterpreted as PyArrayObject *; info and flags are forwarded as-is */
+
+  /* function exit code: close the RefNanny context opened above, then return the implementation's result */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+  int __pyx_v_copy_shape;
+  int __pyx_v_i;
+  int __pyx_v_ndim;
+  int __pyx_v_endian_detector;
+  int __pyx_v_little_endian;
+  int __pyx_v_t;
+  char *__pyx_v_f;
+  PyArray_Descr *__pyx_v_descr = 0;
+  int __pyx_v_offset;
+  int __pyx_v_hasfields;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  int __pyx_t_7;
+  PyObject *__pyx_t_8 = NULL;
+  char *__pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getbuffer__", 0);
+  if (__pyx_v_info != NULL) {
+    __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None);
+    __Pyx_GIVEREF(__pyx_v_info->obj);
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":200
+ *             # of flags
+ * 
+ *             if info == NULL: return             # <<<<<<<<<<<<<<
+ * 
+ *             cdef int copy_shape, i, ndim
+ */
+  __pyx_t_1 = ((__pyx_v_info == NULL) != 0);
+  if (__pyx_t_1) {
+    __pyx_r = 0;
+    goto __pyx_L0;
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":203
+ * 
+ *             cdef int copy_shape, i, ndim
+ *             cdef int endian_detector = 1             # <<<<<<<<<<<<<<
+ *             cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)
+ * 
+ */
+  __pyx_v_endian_detector = 1;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":204
+ *             cdef int copy_shape, i, ndim
+ *             cdef int endian_detector = 1
+ *             cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)             # <<<<<<<<<<<<<<
+ * 
+ *             ndim = PyArray_NDIM(self)
+ */
+  __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":206
+ *             cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)
+ * 
+ *             ndim = PyArray_NDIM(self)             # <<<<<<<<<<<<<<
+ * 
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ */
+  __pyx_v_ndim = PyArray_NDIM(__pyx_v_self);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":208
+ *             ndim = PyArray_NDIM(self)
+ * 
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):             # <<<<<<<<<<<<<<
+ *                 copy_shape = 1
+ *             else:
+ */
+  __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":209
+ * 
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ *                 copy_shape = 1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 copy_shape = 0
+ */
+    __pyx_v_copy_shape = 1;
+    goto __pyx_L4;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":211
+ *                 copy_shape = 1
+ *             else:
+ *                 copy_shape = 0             # <<<<<<<<<<<<<<
+ * 
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ */
+    __pyx_v_copy_shape = 0;
+  }
+  __pyx_L4:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":213
+ *                 copy_shape = 0
+ * 
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)             # <<<<<<<<<<<<<<
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not C contiguous")
+ */
+  __pyx_t_1 = (((__pyx_v_flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":214
+ * 
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):             # <<<<<<<<<<<<<<
+ *                 raise ValueError(u"ndarray is not C contiguous")
+ * 
+ */
+    __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_C_CONTIGUOUS) != 0)) != 0);
+    __pyx_t_3 = __pyx_t_2;
+  } else {
+    __pyx_t_3 = __pyx_t_1;
+  }
+  if (__pyx_t_3) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not C contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ */
+    __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__12, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":217
+ *                 raise ValueError(u"ndarray is not C contiguous")
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)             # <<<<<<<<<<<<<<
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")
+ */
+  __pyx_t_3 = (((__pyx_v_flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) != 0);
+  if (__pyx_t_3) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":218
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):             # <<<<<<<<<<<<<<
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")
+ * 
+ */
+    __pyx_t_1 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_F_CONTIGUOUS) != 0)) != 0);
+    __pyx_t_2 = __pyx_t_1;
+  } else {
+    __pyx_t_2 = __pyx_t_3;
+  }
+  if (__pyx_t_2) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":219
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             info.buf = PyArray_DATA(self)
+ */
+    __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__13, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    {__pyx_filename = __pyx_f[1]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":221
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")
+ * 
+ *             info.buf = PyArray_DATA(self)             # <<<<<<<<<<<<<<
+ *             info.ndim = ndim
+ *             if copy_shape:
+ */
+  __pyx_v_info->buf = PyArray_DATA(__pyx_v_self);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":222
+ * 
+ *             info.buf = PyArray_DATA(self)
+ *             info.ndim = ndim             # <<<<<<<<<<<<<<
+ *             if copy_shape:
+ *                 # Allocate new buffer for strides and shape info.
+ */
+  __pyx_v_info->ndim = __pyx_v_ndim;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":223
+ *             info.buf = PyArray_DATA(self)
+ *             info.ndim = ndim
+ *             if copy_shape:             # <<<<<<<<<<<<<<
+ *                 # Allocate new buffer for strides and shape info.
+ *                 # This is allocated as one block, strides first.
+ */
+  __pyx_t_2 = (__pyx_v_copy_shape != 0);
+  if (__pyx_t_2) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":226
+ *                 # Allocate new buffer for strides and shape info.
+ *                 # This is allocated as one block, strides first.
+ *                 info.strides = <Py_ssize_t*>stdlib.malloc(sizeof(Py_ssize_t) * <size_t>ndim * 2)             # <<<<<<<<<<<<<<
+ *                 info.shape = info.strides + ndim
+ *                 for i in range(ndim):
+ */
+    __pyx_v_info->strides = ((Py_ssize_t *)malloc((((sizeof(Py_ssize_t)) * ((size_t)__pyx_v_ndim)) * 2)));
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227
+ *                 # This is allocated as one block, strides first.
+ *                 info.strides = <Py_ssize_t*>stdlib.malloc(sizeof(Py_ssize_t) * <size_t>ndim * 2)
+ *                 info.shape = info.strides + ndim             # <<<<<<<<<<<<<<
+ *                 for i in range(ndim):
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]
+ */
+    __pyx_v_info->shape = (__pyx_v_info->strides + __pyx_v_ndim);
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228
+ *                 info.strides = <Py_ssize_t*>stdlib.malloc(sizeof(Py_ssize_t) * <size_t>ndim * 2)
+ *                 info.shape = info.strides + ndim
+ *                 for i in range(ndim):             # <<<<<<<<<<<<<<
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]
+ *                     info.shape[i] = PyArray_DIMS(self)[i]
+ */
+    __pyx_t_5 = __pyx_v_ndim;
+    for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+      __pyx_v_i = __pyx_t_6;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229
+ *                 info.shape = info.strides + ndim
+ *                 for i in range(ndim):
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]             # <<<<<<<<<<<<<<
+ *                     info.shape[i] = PyArray_DIMS(self)[i]
+ *             else:
+ */
+      (__pyx_v_info->strides[__pyx_v_i]) = (PyArray_STRIDES(__pyx_v_self)[__pyx_v_i]);
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":230
+ *                 for i in range(ndim):
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]
+ *                     info.shape[i] = PyArray_DIMS(self)[i]             # <<<<<<<<<<<<<<
+ *             else:
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)
+ */
+      (__pyx_v_info->shape[__pyx_v_i]) = (PyArray_DIMS(__pyx_v_self)[__pyx_v_i]);
+    }
+    goto __pyx_L7;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":232
+ *                     info.shape[i] = PyArray_DIMS(self)[i]
+ *             else:
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)             # <<<<<<<<<<<<<<
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)
+ *             info.suboffsets = NULL
+ */
+    __pyx_v_info->strides = ((Py_ssize_t *)PyArray_STRIDES(__pyx_v_self));
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233
+ *             else:
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)             # <<<<<<<<<<<<<<
+ *             info.suboffsets = NULL
+ *             info.itemsize = PyArray_ITEMSIZE(self)
+ */
+    __pyx_v_info->shape = ((Py_ssize_t *)PyArray_DIMS(__pyx_v_self));
+  }
+  __pyx_L7:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":234
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)
+ *             info.suboffsets = NULL             # <<<<<<<<<<<<<<
+ *             info.itemsize = PyArray_ITEMSIZE(self)
+ *             info.readonly = not PyArray_ISWRITEABLE(self)
+ */
+  __pyx_v_info->suboffsets = NULL;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)
+ *             info.suboffsets = NULL
+ *             info.itemsize = PyArray_ITEMSIZE(self)             # <<<<<<<<<<<<<<
+ *             info.readonly = not PyArray_ISWRITEABLE(self)
+ * 
+ */
+  __pyx_v_info->itemsize = PyArray_ITEMSIZE(__pyx_v_self);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":236
+ *             info.suboffsets = NULL
+ *             info.itemsize = PyArray_ITEMSIZE(self)
+ *             info.readonly = not PyArray_ISWRITEABLE(self)             # <<<<<<<<<<<<<<
+ * 
+ *             cdef int t
+ */
+  __pyx_v_info->readonly = (!(PyArray_ISWRITEABLE(__pyx_v_self) != 0));
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":239
+ * 
+ *             cdef int t
+ *             cdef char* f = NULL             # <<<<<<<<<<<<<<
+ *             cdef dtype descr = self.descr
+ *             cdef list stack
+ */
+  __pyx_v_f = NULL;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":240
+ *             cdef int t
+ *             cdef char* f = NULL
+ *             cdef dtype descr = self.descr             # <<<<<<<<<<<<<<
+ *             cdef list stack
+ *             cdef int offset
+ */
+  __pyx_t_4 = ((PyObject *)__pyx_v_self->descr);
+  __Pyx_INCREF(__pyx_t_4);
+  __pyx_v_descr = ((PyArray_Descr *)__pyx_t_4);
+  __pyx_t_4 = 0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":244
+ *             cdef int offset
+ * 
+ *             cdef bint hasfields = PyDataType_HASFIELDS(descr)             # <<<<<<<<<<<<<<
+ * 
+ *             if not hasfields and not copy_shape:
+ */
+  __pyx_v_hasfields = PyDataType_HASFIELDS(__pyx_v_descr);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":246
+ *             cdef bint hasfields = PyDataType_HASFIELDS(descr)
+ * 
+ *             if not hasfields and not copy_shape:             # <<<<<<<<<<<<<<
+ *                 # do not call releasebuffer
+ *                 info.obj = None
+ */
+  __pyx_t_2 = ((!(__pyx_v_hasfields != 0)) != 0);
+  if (__pyx_t_2) {
+    __pyx_t_3 = ((!(__pyx_v_copy_shape != 0)) != 0);
+    __pyx_t_1 = __pyx_t_3;
+  } else {
+    __pyx_t_1 = __pyx_t_2;
+  }
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248
+ *             if not hasfields and not copy_shape:
+ *                 # do not call releasebuffer
+ *                 info.obj = None             # <<<<<<<<<<<<<<
+ *             else:
+ *                 # need to call releasebuffer
+ */
+    __Pyx_INCREF(Py_None);
+    __Pyx_GIVEREF(Py_None);
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj);
+    __pyx_v_info->obj = Py_None;
+    goto __pyx_L10;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":251
+ *             else:
+ *                 # need to call releasebuffer
+ *                 info.obj = self             # <<<<<<<<<<<<<<
+ * 
+ *             if not hasfields:
+ */
+    __Pyx_INCREF(((PyObject *)__pyx_v_self));
+    __Pyx_GIVEREF(((PyObject *)__pyx_v_self));
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj);
+    __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+  }
+  __pyx_L10:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":253
+ *                 info.obj = self
+ * 
+ *             if not hasfields:             # <<<<<<<<<<<<<<
+ *                 t = descr.type_num
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ */
+  __pyx_t_1 = ((!(__pyx_v_hasfields != 0)) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":254
+ * 
+ *             if not hasfields:
+ *                 t = descr.type_num             # <<<<<<<<<<<<<<
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ */
+    __pyx_t_5 = __pyx_v_descr->type_num;
+    __pyx_v_t = __pyx_t_5;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":255
+ *             if not hasfields:
+ *                 t = descr.type_num
+ *                 if ((descr.byteorder == c'>' and little_endian) or             # <<<<<<<<<<<<<<
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")
+ */
+    __pyx_t_1 = ((__pyx_v_descr->byteorder == '>') != 0);
+    if (__pyx_t_1) {
+      __pyx_t_2 = (__pyx_v_little_endian != 0);
+    } else {
+      __pyx_t_2 = __pyx_t_1;
+    }
+    if (!__pyx_t_2) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":256
+ *                 t = descr.type_num
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):             # <<<<<<<<<<<<<<
+ *                     raise ValueError(u"Non-native byte order not supported")
+ *                 if   t == NPY_BYTE:        f = "b"
+ */
+      __pyx_t_1 = ((__pyx_v_descr->byteorder == '<') != 0);
+      if (__pyx_t_1) {
+        __pyx_t_3 = ((!(__pyx_v_little_endian != 0)) != 0);
+        __pyx_t_7 = __pyx_t_3;
+      } else {
+        __pyx_t_7 = __pyx_t_1;
+      }
+      __pyx_t_1 = __pyx_t_7;
+    } else {
+      __pyx_t_1 = __pyx_t_2;
+    }
+    if (__pyx_t_1) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"
+ */
+      __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__14, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 257; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 257; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ *                 elif t == NPY_OBJECT:      f = "O"             # <<<<<<<<<<<<<<
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+    switch (__pyx_v_t) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":258
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")
+ *                 if   t == NPY_BYTE:        f = "b"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_UBYTE:       f = "B"
+ *                 elif t == NPY_SHORT:       f = "h"
+ */
+      case NPY_BYTE:
+      __pyx_v_f = __pyx_k_b;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259
+ *                     raise ValueError(u"Non-native byte order not supported")
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_SHORT:       f = "h"
+ *                 elif t == NPY_USHORT:      f = "H"
+ */
+      case NPY_UBYTE:
+      __pyx_v_f = __pyx_k_B;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":260
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"
+ *                 elif t == NPY_SHORT:       f = "h"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_USHORT:      f = "H"
+ *                 elif t == NPY_INT:         f = "i"
+ */
+      case NPY_SHORT:
+      __pyx_v_f = __pyx_k_h;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261
+ *                 elif t == NPY_UBYTE:       f = "B"
+ *                 elif t == NPY_SHORT:       f = "h"
+ *                 elif t == NPY_USHORT:      f = "H"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_INT:         f = "i"
+ *                 elif t == NPY_UINT:        f = "I"
+ */
+      case NPY_USHORT:
+      __pyx_v_f = __pyx_k_H;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":262
+ *                 elif t == NPY_SHORT:       f = "h"
+ *                 elif t == NPY_USHORT:      f = "H"
+ *                 elif t == NPY_INT:         f = "i"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_UINT:        f = "I"
+ *                 elif t == NPY_LONG:        f = "l"
+ */
+      case NPY_INT:
+      __pyx_v_f = __pyx_k_i;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263
+ *                 elif t == NPY_USHORT:      f = "H"
+ *                 elif t == NPY_INT:         f = "i"
+ *                 elif t == NPY_UINT:        f = "I"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_LONG:        f = "l"
+ *                 elif t == NPY_ULONG:       f = "L"
+ */
+      case NPY_UINT:
+      __pyx_v_f = __pyx_k_I;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":264
+ *                 elif t == NPY_INT:         f = "i"
+ *                 elif t == NPY_UINT:        f = "I"
+ *                 elif t == NPY_LONG:        f = "l"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_ULONG:       f = "L"
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ */
+      case NPY_LONG:
+      __pyx_v_f = __pyx_k_l;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265
+ *                 elif t == NPY_UINT:        f = "I"
+ *                 elif t == NPY_LONG:        f = "l"
+ *                 elif t == NPY_ULONG:       f = "L"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ */
+      case NPY_ULONG:
+      __pyx_v_f = __pyx_k_L;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":266
+ *                 elif t == NPY_LONG:        f = "l"
+ *                 elif t == NPY_ULONG:       f = "L"
+ *                 elif t == NPY_LONGLONG:    f = "q"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ *                 elif t == NPY_FLOAT:       f = "f"
+ */
+      case NPY_LONGLONG:
+      __pyx_v_f = __pyx_k_q;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":267
+ *                 elif t == NPY_ULONG:       f = "L"
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ *                 elif t == NPY_ULONGLONG:   f = "Q"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_FLOAT:       f = "f"
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ */
+      case NPY_ULONGLONG:
+      __pyx_v_f = __pyx_k_Q;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":268
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ *                 elif t == NPY_FLOAT:       f = "f"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ */
+      case NPY_FLOAT:
+      __pyx_v_f = __pyx_k_f;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":269
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ *                 elif t == NPY_FLOAT:       f = "f"
+ *                 elif t == NPY_DOUBLE:      f = "d"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ */
+      case NPY_DOUBLE:
+      __pyx_v_f = __pyx_k_d;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":270
+ *                 elif t == NPY_FLOAT:       f = "f"
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ */
+      case NPY_LONGDOUBLE:
+      __pyx_v_f = __pyx_k_g;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":271
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ *                 elif t == NPY_CFLOAT:      f = "Zf"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ */
+      case NPY_CFLOAT:
+      __pyx_v_f = __pyx_k_Zf;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ *                 elif t == NPY_OBJECT:      f = "O"
+ */
+      case NPY_CDOUBLE:
+      __pyx_v_f = __pyx_k_Zd;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":273
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_OBJECT:      f = "O"
+ *                 else:
+ */
+      case NPY_CLONGDOUBLE:
+      __pyx_v_f = __pyx_k_Zg;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ *                 elif t == NPY_OBJECT:      f = "O"             # <<<<<<<<<<<<<<
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+      case NPY_OBJECT:
+      __pyx_v_f = __pyx_k_O;
+      break;
+      default:
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276
+ *                 elif t == NPY_OBJECT:      f = "O"
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)             # <<<<<<<<<<<<<<
+ *                 info.format = f
+ *                 return
+ */
+      __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_t); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_8 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_t_4); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_8);
+      __Pyx_GIVEREF(__pyx_t_8);
+      __pyx_t_8 = 0;
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_4, NULL); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      break;
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":277
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ *                 info.format = f             # <<<<<<<<<<<<<<
+ *                 return
+ *             else:
+ */
+    __pyx_v_info->format = __pyx_v_f;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":278
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ *                 info.format = f
+ *                 return             # <<<<<<<<<<<<<<
+ *             else:
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)
+ */
+    __pyx_r = 0;
+    goto __pyx_L0;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":280
+ *                 return
+ *             else:
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)             # <<<<<<<<<<<<<<
+ *                 info.format[0] = c'^' # Native data types, manual alignment
+ *                 offset = 0
+ */
+    __pyx_v_info->format = ((char *)malloc(255));
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":281
+ *             else:
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)
+ *                 info.format[0] = c'^' # Native data types, manual alignment             # <<<<<<<<<<<<<<
+ *                 offset = 0
+ *                 f = _util_dtypestring(descr, info.format + 1,
+ */
+    (__pyx_v_info->format[0]) = '^';
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":282
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)
+ *                 info.format[0] = c'^' # Native data types, manual alignment
+ *                 offset = 0             # <<<<<<<<<<<<<<
+ *                 f = _util_dtypestring(descr, info.format + 1,
+ *                                       info.format + _buffer_format_string_len,
+ */
+    __pyx_v_offset = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":283
+ *                 info.format[0] = c'^' # Native data types, manual alignment
+ *                 offset = 0
+ *                 f = _util_dtypestring(descr, info.format + 1,             # <<<<<<<<<<<<<<
+ *                                       info.format + _buffer_format_string_len,
+ *                                       &offset)
+ */
+    __pyx_t_9 = __pyx_f_5numpy__util_dtypestring(__pyx_v_descr, (__pyx_v_info->format + 1), (__pyx_v_info->format + 255), (&__pyx_v_offset)); if (unlikely(__pyx_t_9 == NULL)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 283; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __pyx_v_f = __pyx_t_9;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":286
+ *                                       info.format + _buffer_format_string_len,
+ *                                       &offset)
+ *                 f[0] = c'\0' # Terminate format string             # <<<<<<<<<<<<<<
+ * 
+ *         def __releasebuffer__(ndarray self, Py_buffer* info):
+ */
+    (__pyx_v_f[0]) = '\x00';
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":194
+ *         # experimental exception made for __getbuffer__ and __releasebuffer__
+ *         # -- the details of this may change.
+ *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
+ *             # This implementation of getbuffer is geared towards Cython
+ *             # requirements, and does not yet fullfill the PEP.
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.ndarray.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  if (__pyx_v_info != NULL && __pyx_v_info->obj != NULL) {
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = NULL;
+  }
+  goto __pyx_L2;
+  __pyx_L0:;
+  if (__pyx_v_info != NULL && __pyx_v_info->obj == Py_None) {
+    __Pyx_GOTREF(Py_None);
+    __Pyx_DECREF(Py_None); __pyx_v_info->obj = NULL;
+  }
+  __pyx_L2:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_descr);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288
+ *                 f[0] = c'\0' # Terminate format string
+ * 
+ *         def __releasebuffer__(ndarray self, Py_buffer* info):             # <<<<<<<<<<<<<<
+ *             if PyArray_HASFIELDS(self):
+ *                 stdlib.free(info.format)
+ */
+
+/* Python wrapper */
+static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info); /*proto*/
+/* Wrapper for ndarray.__releasebuffer__ (numpy/__init__.pxd:288).
+ * It only downcasts `self` to PyArrayObject* and forwards both arguments to
+ * the implementation function; it returns nothing. */
+static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info) {
+  PyArrayObject *__pyx_array = (PyArrayObject *)__pyx_v_self;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__releasebuffer__ (wrapper)", 0);
+
+  /* All of the work happens in the typed implementation. */
+  __pyx_pf_5numpy_7ndarray_2__releasebuffer__(__pyx_array, __pyx_v_info);
+
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info) {
+  /* Implementation of ndarray.__releasebuffer__ (numpy/__init__.pxd:288-292).
+   *
+   * Releases the heap blocks attached to `info`:
+   *   - info->format is freed when the array has fields;
+   *   - info->strides is freed when npy_intp and Py_ssize_t differ in size
+   *     (per the .pxd source comment, info.shape was stored after
+   *     info.strides in the same block, so it needs no separate free).
+   */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__releasebuffer__", 0);
+
+  /* numpy/__init__.pxd:289-290 */
+  if (PyArray_HASFIELDS(__pyx_v_self) != 0) {
+    free(__pyx_v_info->format);
+  }
+
+  /* numpy/__init__.pxd:291-292 -- compile-time constant condition. */
+  if ((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) {
+    free(__pyx_v_info->strides);
+  }
+
+  __Pyx_RefNannyFinishContext();
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":768
+ * ctypedef npy_cdouble     complex_t
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) {
+  /* C body of `cdef inline object PyArray_MultiIterNew1(a)`
+   * (numpy/__init__.pxd:768): returns PyArray_MultiIterNew(1, <void*>a).
+   * When that call yields NULL, a traceback entry is added and NULL is
+   * returned to the caller. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0);
+
+  /* numpy/__init__.pxd:769 */
+  __pyx_r = PyArray_MultiIterNew(1, ((void *)__pyx_v_a));
+  if (unlikely(!__pyx_r)) {
+    /* Error path: record where the failure happened. */
+    __pyx_filename = __pyx_f[1]; __pyx_lineno = 769; __pyx_clineno = __LINE__;
+    __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  } else {
+    __Pyx_GOTREF(__pyx_r);
+  }
+
+  /* Common exit: hand the (possibly NULL) result to the caller. */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) {
+  /* C body of `cdef inline object PyArray_MultiIterNew2(a, b)`
+   * (numpy/__init__.pxd:771): returns
+   * PyArray_MultiIterNew(2, <void*>a, <void*>b).
+   * When that call yields NULL, a traceback entry is added and NULL is
+   * returned to the caller. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0);
+
+  /* numpy/__init__.pxd:772 */
+  __pyx_r = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b));
+  if (unlikely(!__pyx_r)) {
+    /* Error path: record where the failure happened. */
+    __pyx_filename = __pyx_f[1]; __pyx_lineno = 772; __pyx_clineno = __LINE__;
+    __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  } else {
+    __Pyx_GOTREF(__pyx_r);
+  }
+
+  /* Common exit: hand the (possibly NULL) result to the caller. */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":774
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) {
+  /* C body of `cdef inline object PyArray_MultiIterNew3(a, b, c)`
+   * (numpy/__init__.pxd:774): returns
+   * PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*>c).
+   * When that call yields NULL, a traceback entry is added and NULL is
+   * returned to the caller. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0);
+
+  /* numpy/__init__.pxd:775 */
+  __pyx_r = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c));
+  if (unlikely(!__pyx_r)) {
+    /* Error path: record where the failure happened. */
+    __pyx_filename = __pyx_f[1]; __pyx_lineno = 775; __pyx_clineno = __LINE__;
+    __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  } else {
+    __Pyx_GOTREF(__pyx_r);
+  }
+
+  /* Common exit: hand the (possibly NULL) result to the caller. */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":777
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) {
+  /* C body of `cdef inline object PyArray_MultiIterNew4(a, b, c, d)`
+   * (numpy/__init__.pxd:777): returns
+   * PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*>d).
+   * When that call yields NULL, a traceback entry is added and NULL is
+   * returned to the caller. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0);
+
+  /* numpy/__init__.pxd:778 */
+  __pyx_r = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d));
+  if (unlikely(!__pyx_r)) {
+    /* Error path: record where the failure happened. */
+    __pyx_filename = __pyx_f[1]; __pyx_lineno = 778; __pyx_clineno = __LINE__;
+    __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  } else {
+    __Pyx_GOTREF(__pyx_r);
+  }
+
+  /* Common exit: hand the (possibly NULL) result to the caller. */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":780
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) {
+  /* C body of `cdef inline object PyArray_MultiIterNew5(a, b, c, d, e)`
+   * (numpy/__init__.pxd:780): returns
+   * PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*>d, <void*>e).
+   * When that call yields NULL, a traceback entry is added and NULL is
+   * returned to the caller. */
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0);
+
+  /* numpy/__init__.pxd:781 */
+  __pyx_r = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e));
+  if (unlikely(!__pyx_r)) {
+    /* Error path: record where the failure happened. */
+    __pyx_filename = __pyx_f[1]; __pyx_lineno = 781; __pyx_clineno = __LINE__;
+    __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  } else {
+    __Pyx_GOTREF(__pyx_r);
+  }
+
+  /* Common exit: hand the (possibly NULL) result to the caller. */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":783
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL:             # <<<<<<<<<<<<<<
+ *     # Recursive utility function used in __getbuffer__ to get format
+ *     # string. The new location in the format string is returned.
+ */
+
+static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx_v_descr, char *__pyx_v_f, char *__pyx_v_end, int *__pyx_v_offset) {
+  PyArray_Descr *__pyx_v_child = 0;
+  int __pyx_v_endian_detector;
+  int __pyx_v_little_endian;
+  PyObject *__pyx_v_fields = 0;
+  PyObject *__pyx_v_childname = NULL;
+  PyObject *__pyx_v_new_offset = NULL;
+  PyObject *__pyx_v_t = NULL;
+  char *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  Py_ssize_t __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  int __pyx_t_7;
+  int __pyx_t_8;
+  int __pyx_t_9;
+  long __pyx_t_10;
+  char *__pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_util_dtypestring", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":790
+ *     cdef int delta_offset
+ *     cdef tuple i
+ *     cdef int endian_detector = 1             # <<<<<<<<<<<<<<
+ *     cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)
+ *     cdef tuple fields
+ */
+  __pyx_v_endian_detector = 1;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791
+ *     cdef tuple i
+ *     cdef int endian_detector = 1
+ *     cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)             # <<<<<<<<<<<<<<
+ *     cdef tuple fields
+ * 
+ */
+  __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794
+ *     cdef tuple fields
+ * 
+ *     for childname in descr.names:             # <<<<<<<<<<<<<<
+ *         fields = descr.fields[childname]
+ *         child, new_offset = fields
+ */
+  if (unlikely(__pyx_v_descr->names == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
+    {__pyx_filename = __pyx_f[1]; __pyx_lineno = 794; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+  __pyx_t_1 = __pyx_v_descr->names; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0;
+  for (;;) {
+    if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break;
+    #if CYTHON_COMPILING_IN_CPYTHON
+    __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_3); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 794; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    #else
+    __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 794; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    #endif
+    __Pyx_XDECREF_SET(__pyx_v_childname, __pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":795
+ * 
+ *     for childname in descr.names:
+ *         fields = descr.fields[childname]             # <<<<<<<<<<<<<<
+ *         child, new_offset = fields
+ * 
+ */
+    __pyx_t_3 = PyObject_GetItem(__pyx_v_descr->fields, __pyx_v_childname); if (unlikely(__pyx_t_3 == NULL)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 795; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+    __Pyx_GOTREF(__pyx_t_3);
+    if (!(likely(PyTuple_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "tuple", Py_TYPE(__pyx_t_3)->tp_name), 0))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 795; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_XDECREF_SET(__pyx_v_fields, ((PyObject*)__pyx_t_3));
+    __pyx_t_3 = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":796
+ *     for childname in descr.names:
+ *         fields = descr.fields[childname]
+ *         child, new_offset = fields             # <<<<<<<<<<<<<<
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:
+ */
+    if (likely(__pyx_v_fields != Py_None)) {
+      PyObject* sequence = __pyx_v_fields;
+      #if CYTHON_COMPILING_IN_CPYTHON
+      Py_ssize_t size = Py_SIZE(sequence);
+      #else
+      Py_ssize_t size = PySequence_Size(sequence);
+      #endif
+      if (unlikely(size != 2)) {
+        if (size > 2) __Pyx_RaiseTooManyValuesError(2);
+        else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
+        {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      #if CYTHON_COMPILING_IN_CPYTHON
+      __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); 
+      __pyx_t_4 = PyTuple_GET_ITEM(sequence, 1); 
+      __Pyx_INCREF(__pyx_t_3);
+      __Pyx_INCREF(__pyx_t_4);
+      #else
+      __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      #endif
+    } else {
+      __Pyx_RaiseNoneNotIterableError(); {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_dtype))))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_XDECREF_SET(__pyx_v_child, ((PyArray_Descr *)__pyx_t_3));
+    __pyx_t_3 = 0;
+    __Pyx_XDECREF_SET(__pyx_v_new_offset, __pyx_t_4);
+    __pyx_t_4 = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":798
+ *         child, new_offset = fields
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:             # <<<<<<<<<<<<<<
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 798; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_3 = PyNumber_Subtract(__pyx_v_new_offset, __pyx_t_4); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 798; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 798; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __pyx_t_6 = ((((__pyx_v_end - __pyx_v_f) - ((int)__pyx_t_5)) < 15) != 0);
+    if (__pyx_t_6) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":799
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")             # <<<<<<<<<<<<<<
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or
+ */
+      __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__15, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or             # <<<<<<<<<<<<<<
+ *             (child.byteorder == c'<' and not little_endian)):
+ *             raise ValueError(u"Non-native byte order not supported")
+ */
+    __pyx_t_6 = ((__pyx_v_child->byteorder == '>') != 0);
+    if (__pyx_t_6) {
+      __pyx_t_7 = (__pyx_v_little_endian != 0);
+    } else {
+      __pyx_t_7 = __pyx_t_6;
+    }
+    if (!__pyx_t_7) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":802
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or
+ *             (child.byteorder == c'<' and not little_endian)):             # <<<<<<<<<<<<<<
+ *             raise ValueError(u"Non-native byte order not supported")
+ *             # One could encode it in the format string and have Cython
+ */
+      __pyx_t_6 = ((__pyx_v_child->byteorder == '<') != 0);
+      if (__pyx_t_6) {
+        __pyx_t_8 = ((!(__pyx_v_little_endian != 0)) != 0);
+        __pyx_t_9 = __pyx_t_8;
+      } else {
+        __pyx_t_9 = __pyx_t_6;
+      }
+      __pyx_t_6 = __pyx_t_9;
+    } else {
+      __pyx_t_6 = __pyx_t_7;
+    }
+    if (__pyx_t_6) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803
+ *         if ((child.byteorder == c'>' and little_endian) or
+ *             (child.byteorder == c'<' and not little_endian)):
+ *             raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *             # One could encode it in the format string and have Cython
+ *             # complain instead, BUT: < and > in format strings also imply
+ */
+      __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__16, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 803; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 803; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":813
+ * 
+ *         # Output padding bytes
+ *         while offset[0] < new_offset:             # <<<<<<<<<<<<<<
+ *             f[0] = 120 # "x"; pad byte
+ *             f += 1
+ */
+    while (1) {
+      __pyx_t_3 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 813; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_t_3, __pyx_v_new_offset, Py_LT); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 813; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 813; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (!__pyx_t_6) break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814
+ *         # Output padding bytes
+ *         while offset[0] < new_offset:
+ *             f[0] = 120 # "x"; pad byte             # <<<<<<<<<<<<<<
+ *             f += 1
+ *             offset[0] += 1
+ */
+      (__pyx_v_f[0]) = 120;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":815
+ *         while offset[0] < new_offset:
+ *             f[0] = 120 # "x"; pad byte
+ *             f += 1             # <<<<<<<<<<<<<<
+ *             offset[0] += 1
+ * 
+ */
+      __pyx_v_f = (__pyx_v_f + 1);
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":816
+ *             f[0] = 120 # "x"; pad byte
+ *             f += 1
+ *             offset[0] += 1             # <<<<<<<<<<<<<<
+ * 
+ *         offset[0] += child.itemsize
+ */
+      __pyx_t_10 = 0;
+      (__pyx_v_offset[__pyx_t_10]) = ((__pyx_v_offset[__pyx_t_10]) + 1);
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":818
+ *             offset[0] += 1
+ * 
+ *         offset[0] += child.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *         if not PyDataType_HASFIELDS(child):
+ */
+    __pyx_t_10 = 0;
+    (__pyx_v_offset[__pyx_t_10]) = ((__pyx_v_offset[__pyx_t_10]) + __pyx_v_child->elsize);
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":820
+ *         offset[0] += child.itemsize
+ * 
+ *         if not PyDataType_HASFIELDS(child):             # <<<<<<<<<<<<<<
+ *             t = child.type_num
+ *             if end - f < 5:
+ */
+    __pyx_t_6 = ((!(PyDataType_HASFIELDS(__pyx_v_child) != 0)) != 0);
+    if (__pyx_t_6) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":821
+ * 
+ *         if not PyDataType_HASFIELDS(child):
+ *             t = child.type_num             # <<<<<<<<<<<<<<
+ *             if end - f < 5:
+ *                 raise RuntimeError(u"Format string allocated too short.")
+ */
+      __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_child->type_num); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 821; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __Pyx_XDECREF_SET(__pyx_v_t, __pyx_t_4);
+      __pyx_t_4 = 0;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822
+ *         if not PyDataType_HASFIELDS(child):
+ *             t = child.type_num
+ *             if end - f < 5:             # <<<<<<<<<<<<<<
+ *                 raise RuntimeError(u"Format string allocated too short.")
+ * 
+ */
+      __pyx_t_6 = (((__pyx_v_end - __pyx_v_f) < 5) != 0);
+      if (__pyx_t_6) {
+
+        /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823
+ *             t = child.type_num
+ *             if end - f < 5:
+ *                 raise RuntimeError(u"Format string allocated too short.")             # <<<<<<<<<<<<<<
+ * 
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ */
+        __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__17, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 823; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_4);
+        __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        {__pyx_filename = __pyx_f[1]; __pyx_lineno = 823; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":826
+ * 
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ *             if   t == NPY_BYTE:        f[0] =  98 #"b"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_BYTE); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 826; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 826; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 826; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 98;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ *             if   t == NPY_BYTE:        f[0] =  98 #"b"
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_UBYTE); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 827; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 827; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 827; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 66;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":828
+ *             if   t == NPY_BYTE:        f[0] =  98 #"b"
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_SHORT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 828; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 828; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 828; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 104;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":829
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_USHORT); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 829; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 829; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 829; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 72;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":830
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ *             elif t == NPY_INT:         f[0] = 105 #"i"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_INT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 830; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 830; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 830; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 105;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":831
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_UINT); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 831; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 831; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 831; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 73;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":832
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_LONG); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 832; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 832; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 832; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 108;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":833
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_ULONG); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 833; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 833; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 833; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 76;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":834
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_LONGLONG); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 834; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 834; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 834; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 113;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":835
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_ULONGLONG); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 835; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 835; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 835; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 81;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":836
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_FLOAT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 836; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 836; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 836; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 102;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":837
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_DOUBLE); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 837; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 837; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 837; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 100;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":838
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_LONGDOUBLE); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 838; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 838; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 838; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 103;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":839
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf             # <<<<<<<<<<<<<<
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_CFLOAT); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 839; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 839; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 839; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 90;
+        (__pyx_v_f[1]) = 102;
+        __pyx_v_f = (__pyx_v_f + 1);
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":840
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd             # <<<<<<<<<<<<<<
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_CDOUBLE); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 840; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 840; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 840; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 90;
+        (__pyx_v_f[1]) = 100;
+        __pyx_v_f = (__pyx_v_f + 1);
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":841
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg             # <<<<<<<<<<<<<<
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"
+ *             else:
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_CLONGDOUBLE); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 841; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 841; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 841; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 90;
+        (__pyx_v_f[1]) = 103;
+        __pyx_v_f = (__pyx_v_f + 1);
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":842
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"             # <<<<<<<<<<<<<<
+ *             else:
+ *                 raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_OBJECT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 842; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 842; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 842; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 79;
+        goto __pyx_L11;
+      }
+      /*else*/ {
+
+        /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"
+ *             else:
+ *                 raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)             # <<<<<<<<<<<<<<
+ *             f += 1
+ *         else:
+ */
+        __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_v_t); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_3);
+        __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_4);
+        PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3);
+        __Pyx_GIVEREF(__pyx_t_3);
+        __pyx_t_3 = 0;
+        __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_4, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_3);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+        __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+        {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_L11:;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":845
+ *             else:
+ *                 raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ *             f += 1             # <<<<<<<<<<<<<<
+ *         else:
+ *             # Cython ignores struct boundary information ("T{...}"),
+ */
+      __pyx_v_f = (__pyx_v_f + 1);
+      goto __pyx_L9;
+    }
+    /*else*/ {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":849
+ *             # Cython ignores struct boundary information ("T{...}"),
+ *             # so don't output it
+ *             f = _util_dtypestring(child, f, end, offset)             # <<<<<<<<<<<<<<
+ *     return f
+ * 
+ */
+      __pyx_t_11 = __pyx_f_5numpy__util_dtypestring(__pyx_v_child, __pyx_v_f, __pyx_v_end, __pyx_v_offset); if (unlikely(__pyx_t_11 == NULL)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 849; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __pyx_v_f = __pyx_t_11;
+    }
+    __pyx_L9:;
+  }
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":850
+ *             # so don't output it
+ *             f = _util_dtypestring(child, f, end, offset)
+ *     return f             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_f;
+  goto __pyx_L0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":783
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL:             # <<<<<<<<<<<<<<
+ *     # Recursive utility function used in __getbuffer__ to get format
+ *     # string. The new location in the format string is returned.
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("numpy._util_dtypestring", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_child);
+  __Pyx_XDECREF(__pyx_v_fields);
+  __Pyx_XDECREF(__pyx_v_childname);
+  __Pyx_XDECREF(__pyx_v_new_offset);
+  __Pyx_XDECREF(__pyx_v_t);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":966
+ * 
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base):             # <<<<<<<<<<<<<<
+ *      cdef PyObject* baseptr
+ *      if base is None:
+ *
+ * Generated C for set_array_base(): installs `base` as the new arr->base.
+ *   - When `base` is Py_None, arr->base is set to NULL.
+ *   - Otherwise a new reference to `base` is taken and stored.
+ * In both cases the reference previously held in arr->base (if any) is
+ * released. The new reference is acquired before the old one is dropped.
+ * Returns nothing and has no error path.
+ */
+
+static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) {
+  PyObject *__pyx_v_baseptr;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  __Pyx_RefNannySetupContext("set_array_base", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":968
+ * cdef inline void set_array_base(ndarray arr, object base):
+ *      cdef PyObject* baseptr
+ *      if base is None:             # <<<<<<<<<<<<<<
+ *          baseptr = NULL
+ *      else:
+ */
+  __pyx_t_1 = (__pyx_v_base == Py_None);
+  __pyx_t_2 = (__pyx_t_1 != 0);
+  if (__pyx_t_2) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":969
+ *      cdef PyObject* baseptr
+ *      if base is None:
+ *          baseptr = NULL             # <<<<<<<<<<<<<<
+ *      else:
+ *          Py_INCREF(base) # important to do this before decref below!
+ */
+    __pyx_v_baseptr = NULL;
+    goto __pyx_L3;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":971
+ *          baseptr = NULL
+ *      else:
+ *          Py_INCREF(base) # important to do this before decref below!             # <<<<<<<<<<<<<<
+ *          baseptr = <PyObject*>base
+ *      Py_XDECREF(arr.base)
+ */
+    Py_INCREF(__pyx_v_base);
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":972
+ *      else:
+ *          Py_INCREF(base) # important to do this before decref below!
+ *          baseptr = <PyObject*>base             # <<<<<<<<<<<<<<
+ *      Py_XDECREF(arr.base)
+ *      arr.base = baseptr
+ */
+    __pyx_v_baseptr = ((PyObject *)__pyx_v_base);
+  }
+  __pyx_L3:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":973
+ *          Py_INCREF(base) # important to do this before decref below!
+ *          baseptr = <PyObject*>base
+ *      Py_XDECREF(arr.base)             # <<<<<<<<<<<<<<
+ *      arr.base = baseptr
+ * 
+ */
+  Py_XDECREF(__pyx_v_arr->base); /* X-variant: the old base may be NULL */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":974
+ *          baseptr = <PyObject*>base
+ *      Py_XDECREF(arr.base)
+ *      arr.base = baseptr             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ */
+  __pyx_v_arr->base = __pyx_v_baseptr;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":966
+ * 
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base):             # <<<<<<<<<<<<<<
+ *      cdef PyObject* baseptr
+ *      if base is None:
+ */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976
+ *      arr.base = baseptr
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     if arr.base is NULL:
+ *         return None
+ *
+ * Generated C for get_array_base(): returns a new reference to arr->base,
+ * or a new reference to Py_None when arr->base is NULL. Both branches jump
+ * to the shared exit label; there is no error path, so the result is never
+ * NULL.
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) {
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  __Pyx_RefNannySetupContext("get_array_base", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ *     if arr.base is NULL:             # <<<<<<<<<<<<<<
+ *         return None
+ *     else:
+ */
+  __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":978
+ * cdef inline object get_array_base(ndarray arr):
+ *     if arr.base is NULL:
+ *         return None             # <<<<<<<<<<<<<<
+ *     else:
+ *         return <object>arr.base
+ */
+    __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL here, so this releases nothing */
+    __Pyx_INCREF(Py_None);
+    __pyx_r = Py_None;
+    goto __pyx_L0;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":980
+ *         return None
+ *     else:
+ *         return <object>arr.base             # <<<<<<<<<<<<<<
+ */
+    __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL here, so this releases nothing */
+    __Pyx_INCREF(((PyObject *)__pyx_v_arr->base));
+    __pyx_r = ((PyObject *)__pyx_v_arr->base);
+    goto __pyx_L0;
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976
+ *      arr.base = baseptr
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     if arr.base is NULL:
+ *         return None
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyMethodDef __pyx_methods[] = { /* Module method table: contains only the
+ * terminating all-zero entry. cpu_nms is not listed here; the module init
+ * function creates it with PyCFunction_NewEx() and stores it in the module dict.
+ */
+  {0, 0, 0, 0}
+};
+
+#if PY_MAJOR_VERSION >= 3 /* Python 3 only: module definition passed to PyModule_Create() by PyInit_cpu_nms() */
+static struct PyModuleDef __pyx_moduledef = {
+  #if PY_VERSION_HEX < 0x03020000 /* before Python 3.2: spell the header initialiser out by hand */
+    { PyObject_HEAD_INIT(NULL) NULL, 0, NULL },
+  #else
+    PyModuleDef_HEAD_INIT,
+  #endif
+    __Pyx_NAMESTR("cpu_nms"),
+    0, /* m_doc */
+    -1, /* m_size */
+    __pyx_methods /* m_methods */,
+    NULL, /* m_reload */
+    NULL, /* m_traverse */
+    NULL, /* m_clear */
+    NULL /* m_free */
+};
+#endif
+
+static __Pyx_StringTabEntry __pyx_string_tab[] = { /* String constant table handed to
+ * __Pyx_InitStrings() from __Pyx_InitGlobals(). Each row gives the address of
+ * the variable that receives the string object, the C string constant it is
+ * built from, and that constant's sizeof. The four trailing numeric fields
+ * are defined by the __Pyx_StringTabEntry declaration, which is outside this
+ * part of the file. The all-zero row terminates the table.
+ */
+  {&__pyx_kp_u_Format_string_allocated_too_shor, __pyx_k_Format_string_allocated_too_shor, sizeof(__pyx_k_Format_string_allocated_too_shor), 0, 1, 0, 0},
+  {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0},
+  {&__pyx_kp_u_Non_native_byte_order_not_suppor, __pyx_k_Non_native_byte_order_not_suppor, sizeof(__pyx_k_Non_native_byte_order_not_suppor), 0, 1, 0, 0},
+  {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1},
+  {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1},
+  {&__pyx_n_s_areas, __pyx_k_areas, sizeof(__pyx_k_areas), 0, 0, 1, 1},
+  {&__pyx_n_s_argsort, __pyx_k_argsort, sizeof(__pyx_k_argsort), 0, 0, 1, 1},
+  {&__pyx_n_s_cpu_nms, __pyx_k_cpu_nms, sizeof(__pyx_k_cpu_nms), 0, 0, 1, 1},
+  {&__pyx_n_s_dets, __pyx_k_dets, sizeof(__pyx_k_dets), 0, 0, 1, 1},
+  {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1},
+  {&__pyx_n_s_h, __pyx_k_h, sizeof(__pyx_k_h), 0, 0, 1, 1},
+  {&__pyx_n_s_i, __pyx_k_i, sizeof(__pyx_k_i), 0, 0, 1, 1},
+  {&__pyx_n_s_i_2, __pyx_k_i_2, sizeof(__pyx_k_i_2), 0, 0, 1, 1},
+  {&__pyx_n_s_iarea, __pyx_k_iarea, sizeof(__pyx_k_iarea), 0, 0, 1, 1},
+  {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1},
+  {&__pyx_n_s_int, __pyx_k_int, sizeof(__pyx_k_int), 0, 0, 1, 1},
+  {&__pyx_n_s_inter, __pyx_k_inter, sizeof(__pyx_k_inter), 0, 0, 1, 1},
+  {&__pyx_n_s_ix1, __pyx_k_ix1, sizeof(__pyx_k_ix1), 0, 0, 1, 1},
+  {&__pyx_n_s_ix2, __pyx_k_ix2, sizeof(__pyx_k_ix2), 0, 0, 1, 1},
+  {&__pyx_n_s_iy1, __pyx_k_iy1, sizeof(__pyx_k_iy1), 0, 0, 1, 1},
+  {&__pyx_n_s_iy2, __pyx_k_iy2, sizeof(__pyx_k_iy2), 0, 0, 1, 1},
+  {&__pyx_n_s_j, __pyx_k_j, sizeof(__pyx_k_j), 0, 0, 1, 1},
+  {&__pyx_n_s_j_2, __pyx_k_j_2, sizeof(__pyx_k_j_2), 0, 0, 1, 1},
+  {&__pyx_n_s_keep, __pyx_k_keep, sizeof(__pyx_k_keep), 0, 0, 1, 1},
+  {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1},
+  {&__pyx_kp_u_ndarray_is_not_C_contiguous, __pyx_k_ndarray_is_not_C_contiguous, sizeof(__pyx_k_ndarray_is_not_C_contiguous), 0, 1, 0, 0},
+  {&__pyx_kp_u_ndarray_is_not_Fortran_contiguou, __pyx_k_ndarray_is_not_Fortran_contiguou, sizeof(__pyx_k_ndarray_is_not_Fortran_contiguou), 0, 1, 0, 0},
+  {&__pyx_n_s_ndets, __pyx_k_ndets, sizeof(__pyx_k_ndets), 0, 0, 1, 1},
+  {&__pyx_kp_s_nfs_yoda_xinleic_Inf_Code_Faste, __pyx_k_nfs_yoda_xinleic_Inf_Code_Faste, sizeof(__pyx_k_nfs_yoda_xinleic_Inf_Code_Faste), 0, 0, 1, 0},
+  {&__pyx_n_s_nms_cpu_nms, __pyx_k_nms_cpu_nms, sizeof(__pyx_k_nms_cpu_nms), 0, 0, 1, 1},
+  {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1},
+  {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1},
+  {&__pyx_n_s_order, __pyx_k_order, sizeof(__pyx_k_order), 0, 0, 1, 1},
+  {&__pyx_n_s_ovr, __pyx_k_ovr, sizeof(__pyx_k_ovr), 0, 0, 1, 1},
+  {&__pyx_n_s_pyx_getbuffer, __pyx_k_pyx_getbuffer, sizeof(__pyx_k_pyx_getbuffer), 0, 0, 1, 1},
+  {&__pyx_n_s_pyx_releasebuffer, __pyx_k_pyx_releasebuffer, sizeof(__pyx_k_pyx_releasebuffer), 0, 0, 1, 1},
+  {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1},
+  {&__pyx_n_s_scores, __pyx_k_scores, sizeof(__pyx_k_scores), 0, 0, 1, 1},
+  {&__pyx_n_s_suppressed, __pyx_k_suppressed, sizeof(__pyx_k_suppressed), 0, 0, 1, 1},
+  {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1},
+  {&__pyx_n_s_thresh, __pyx_k_thresh, sizeof(__pyx_k_thresh), 0, 0, 1, 1},
+  {&__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_k_unknown_dtype_code_in_numpy_pxd, sizeof(__pyx_k_unknown_dtype_code_in_numpy_pxd), 0, 1, 0, 0},
+  {&__pyx_n_s_w, __pyx_k_w, sizeof(__pyx_k_w), 0, 0, 1, 1},
+  {&__pyx_n_s_x1, __pyx_k_x1, sizeof(__pyx_k_x1), 0, 0, 1, 1},
+  {&__pyx_n_s_x2, __pyx_k_x2, sizeof(__pyx_k_x2), 0, 0, 1, 1},
+  {&__pyx_n_s_xx1, __pyx_k_xx1, sizeof(__pyx_k_xx1), 0, 0, 1, 1},
+  {&__pyx_n_s_xx2, __pyx_k_xx2, sizeof(__pyx_k_xx2), 0, 0, 1, 1},
+  {&__pyx_n_s_y1, __pyx_k_y1, sizeof(__pyx_k_y1), 0, 0, 1, 1},
+  {&__pyx_n_s_y2, __pyx_k_y2, sizeof(__pyx_k_y2), 0, 0, 1, 1},
+  {&__pyx_n_s_yy1, __pyx_k_yy1, sizeof(__pyx_k_yy1), 0, 0, 1, 1},
+  {&__pyx_n_s_yy2, __pyx_k_yy2, sizeof(__pyx_k_yy2), 0, 0, 1, 1},
+  {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1},
+  {0, 0, 0, 0, 0, 0, 0}
+};
+static int __Pyx_InitCachedBuiltins(void) { /* Look up the builtins `range`,
+ * `ValueError` and `RuntimeError` once through __Pyx_GetBuiltinName() and
+ * cache them in __pyx_builtin_* variables. Returns 0 on success. On the
+ * first failed lookup it records the source position in
+ * __pyx_filename / __pyx_lineno / __pyx_clineno and returns -1.
+ */
+  __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 43; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  return 0;
+  __pyx_L1_error:;
+  return -1;
+}
+
+static int __Pyx_InitCachedConstants(void) { /* Build, once, the constant objects
+ * this module reuses:
+ *   - five (slice(None, None, None), k) index tuples, k = 0..4, for dets[:, k];
+ *   - the slice(None, None, -1) object for [::-1];
+ *   - one-element argument tuples holding the messages of the ValueError /
+ *     RuntimeError raises quoted from numpy's __init__.pxd;
+ *   - the 29-entry variable-name tuple and the code object for cpu_nms.
+ * It uses the __pyx_int_* constants and interned strings, so the module init
+ * function calls __Pyx_InitGlobals() before this. Returns 0 on success and
+ * -1 on the first failed allocation, after recording the source position.
+ */
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0);
+
+  /* "nms/cpu_nms.pyx":18
+ * 
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ */
+  __pyx_slice_ = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice_)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice_);
+  __Pyx_GIVEREF(__pyx_slice_);
+  __pyx_tuple__2 = PyTuple_Pack(2, __pyx_slice_, __pyx_int_0); if (unlikely(!__pyx_tuple__2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 18; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__2);
+  __Pyx_GIVEREF(__pyx_tuple__2);
+
+  /* "nms/cpu_nms.pyx":19
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+ */
+  __pyx_slice__3 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice__3);
+  __Pyx_GIVEREF(__pyx_slice__3);
+  __pyx_tuple__4 = PyTuple_Pack(2, __pyx_slice__3, __pyx_int_1); if (unlikely(!__pyx_tuple__4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 19; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__4);
+  __Pyx_GIVEREF(__pyx_tuple__4);
+
+  /* "nms/cpu_nms.pyx":20
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+ *     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+ */
+  __pyx_slice__5 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice__5);
+  __Pyx_GIVEREF(__pyx_slice__5);
+  __pyx_tuple__6 = PyTuple_Pack(2, __pyx_slice__5, __pyx_int_2); if (unlikely(!__pyx_tuple__6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 20; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__6);
+  __Pyx_GIVEREF(__pyx_tuple__6);
+
+  /* "nms/cpu_nms.pyx":21
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+ * 
+ */
+  __pyx_slice__7 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice__7);
+  __Pyx_GIVEREF(__pyx_slice__7);
+  __pyx_tuple__8 = PyTuple_Pack(2, __pyx_slice__7, __pyx_int_3); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__8);
+  __Pyx_GIVEREF(__pyx_tuple__8);
+
+  /* "nms/cpu_nms.pyx":22
+ *     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+ *     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]             # <<<<<<<<<<<<<<
+ * 
+ *     cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ */
+  __pyx_slice__9 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__9)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice__9);
+  __Pyx_GIVEREF(__pyx_slice__9);
+  __pyx_tuple__10 = PyTuple_Pack(2, __pyx_slice__9, __pyx_int_4); if (unlikely(!__pyx_tuple__10)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__10);
+  __Pyx_GIVEREF(__pyx_tuple__10);
+
+  /* "nms/cpu_nms.pyx":25
+ * 
+ *     cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+ *     cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]             # <<<<<<<<<<<<<<
+ * 
+ *     cdef int ndets = dets.shape[0]
+ */
+  __pyx_slice__11 = PySlice_New(Py_None, Py_None, __pyx_int_neg_1); if (unlikely(!__pyx_slice__11)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice__11);
+  __Pyx_GIVEREF(__pyx_slice__11);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not C contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ */
+  __pyx_tuple__12 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__12)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__12);
+  __Pyx_GIVEREF(__pyx_tuple__12);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":219
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             info.buf = PyArray_DATA(self)
+ */
+  __pyx_tuple__13 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__13)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__13);
+  __Pyx_GIVEREF(__pyx_tuple__13);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"
+ */
+  __pyx_tuple__14 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__14)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 257; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__14);
+  __Pyx_GIVEREF(__pyx_tuple__14);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":799
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")             # <<<<<<<<<<<<<<
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or
+ */
+  __pyx_tuple__15 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__15)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__15);
+  __Pyx_GIVEREF(__pyx_tuple__15);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803
+ *         if ((child.byteorder == c'>' and little_endian) or
+ *             (child.byteorder == c'<' and not little_endian)):
+ *             raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *             # One could encode it in the format string and have Cython
+ *             # complain instead, BUT: < and > in format strings also imply
+ */
+  __pyx_tuple__16 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__16)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 803; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__16);
+  __Pyx_GIVEREF(__pyx_tuple__16);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823
+ *             t = child.type_num
+ *             if end - f < 5:
+ *                 raise RuntimeError(u"Format string allocated too short.")             # <<<<<<<<<<<<<<
+ * 
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ */
+  __pyx_tuple__17 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__17)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 823; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__17);
+  __Pyx_GIVEREF(__pyx_tuple__17);
+
+  /* "nms/cpu_nms.pyx":17
+ *     return a if a <= b else b
+ * 
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ */
+  __pyx_tuple__18 = PyTuple_Pack(29, __pyx_n_s_dets, __pyx_n_s_thresh, __pyx_n_s_x1, __pyx_n_s_y1, __pyx_n_s_x2, __pyx_n_s_y2, __pyx_n_s_scores, __pyx_n_s_areas, __pyx_n_s_order, __pyx_n_s_ndets, __pyx_n_s_suppressed, __pyx_n_s_i_2, __pyx_n_s_j, __pyx_n_s_i, __pyx_n_s_j_2, __pyx_n_s_ix1, __pyx_n_s_iy1, __pyx_n_s_ix2, __pyx_n_s_iy2, __pyx_n_s_iarea, __pyx_n_s_xx1, __pyx_n_s_yy1, __pyx_n_s_xx2, __pyx_n_s_yy2, __pyx_n_s_w, __pyx_n_s_h, __pyx_n_s_inter, __pyx_n_s_ovr, __pyx_n_s_keep); if (unlikely(!__pyx_tuple__18)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__18);
+  __Pyx_GIVEREF(__pyx_tuple__18);
+  __pyx_codeobj__19 = (PyObject*)__Pyx_PyCode_New(2, 0, 29, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__18, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_nfs_yoda_xinleic_Inf_Code_Faste, __pyx_n_s_cpu_nms, 17, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__19)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:;
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+
+static int __Pyx_InitGlobals(void) { /* Initialise the string constants listed in
+ * __pyx_string_tab and create the cached integer objects 0, 1, 2, 3, 4 and
+ * -1. Returns 0 on success. On the first failure it records the source
+ * position in __pyx_filename / __pyx_lineno / __pyx_clineno and returns -1.
+ */
+  if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_int_2 = PyInt_FromLong(2); if (unlikely(!__pyx_int_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_int_3 = PyInt_FromLong(3); if (unlikely(!__pyx_int_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_int_4 = PyInt_FromLong(4); if (unlikely(!__pyx_int_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_int_neg_1 = PyInt_FromLong(-1); if (unlikely(!__pyx_int_neg_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  return 0;
+  __pyx_L1_error:;
+  return -1;
+}
+
+#if PY_MAJOR_VERSION < 3
+PyMODINIT_FUNC initcpu_nms(void); /*proto*/
+PyMODINIT_FUNC initcpu_nms(void)
+#else
+PyMODINIT_FUNC PyInit_cpu_nms(void); /*proto*/
+PyMODINIT_FUNC PyInit_cpu_nms(void)
+#endif
+{ /* Module initialisation entry point (initcpu_nms on Python 2, PyInit_cpu_nms
+ * on Python 3). In order it:
+ *   1. checks the binary version and creates the shared empty tuple / bytes;
+ *   2. creates the module object and fetches its dict and the builtins module;
+ *   3. runs __Pyx_InitGlobals(), __Pyx_InitCachedBuiltins() and
+ *      __Pyx_InitCachedConstants();
+ *   4. imports the `type` type and the numpy extension types
+ *      (dtype, flatiter, broadcast, ndarray, ufunc);
+ *   5. executes the module body: binds `np`, `cpu_nms` and an empty dict
+ *      under the name held in __pyx_n_s_test.
+ * Any failure jumps to __pyx_L1_error. On Python 3 the function returns the
+ * module object, which the error path has reset to 0.
+ */
+  PyObject *__pyx_t_1 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannyDeclarations
+  #if CYTHON_REFNANNY
+  __Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny");
+  if (!__Pyx_RefNanny) {
+      PyErr_Clear();
+      __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny");
+      if (!__Pyx_RefNanny)
+          Py_FatalError("failed to import 'refnanny' module");
+  }
+  #endif
+  __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit_cpu_nms(void)", 0);
+  if ( __Pyx_check_binary_version() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #ifdef __Pyx_CyFunction_USED
+  if (__Pyx_CyFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  if (__pyx_FusedFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  #ifdef __Pyx_Generator_USED
+  if (__pyx_Generator_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  /*--- Library function declarations ---*/
+  /*--- Threads initialization code ---*/
+  #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS
+  #ifdef WITH_THREAD /* Python build with threading support? */
+  PyEval_InitThreads();
+  #endif
+  #endif
+  /*--- Module creation code ---*/
+  #if PY_MAJOR_VERSION < 3
+  __pyx_m = Py_InitModule4(__Pyx_NAMESTR("cpu_nms"), __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m);
+  #else
+  __pyx_m = PyModule_Create(&__pyx_moduledef);
+  #endif
+  if (unlikely(!__pyx_m)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  Py_INCREF(__pyx_d); /* keep our own reference to the module dict */
+  __pyx_b = PyImport_AddModule(__Pyx_NAMESTR(__Pyx_BUILTIN_MODULE_NAME)); if (unlikely(!__pyx_b)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #if CYTHON_COMPILING_IN_PYPY
+  Py_INCREF(__pyx_b);
+  #endif
+  if (__Pyx_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  /*--- Initialize various global constants etc. ---*/
+  if (unlikely(__Pyx_InitGlobals() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT)
+  if (__Pyx_init_sys_getdefaultencoding_params() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  if (__pyx_module_is_main_nms__cpu_nms) {
+    if (__Pyx_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  }
+  #if PY_MAJOR_VERSION >= 3
+  { /* Python 3: register the module under "nms.cpu_nms" unless that key already exists */
+    PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    if (!PyDict_GetItemString(modules, "nms.cpu_nms")) {
+      if (unlikely(PyDict_SetItemString(modules, "nms.cpu_nms", __pyx_m) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+  }
+  #endif
+  /*--- Builtin init code ---*/
+  if (unlikely(__Pyx_InitCachedBuiltins() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  /*--- Constants init code ---*/
+  if (unlikely(__Pyx_InitCachedConstants() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  /*--- Global init code ---*/
+  /*--- Variable export code ---*/
+  /*--- Function export code ---*/
+  /*--- Type init code ---*/
+  /*--- Type import code ---*/
+  __pyx_ptype_7cpython_4type_type = __Pyx_ImportType(__Pyx_BUILTIN_MODULE_NAME, "type", 
+  #if CYTHON_COMPILING_IN_PYPY
+  sizeof(PyTypeObject),
+  #else
+  sizeof(PyHeapTypeObject),
+  #endif
+  0); if (unlikely(!__pyx_ptype_7cpython_4type_type)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_dtype = __Pyx_ImportType("numpy", "dtype", sizeof(PyArray_Descr), 0); if (unlikely(!__pyx_ptype_5numpy_dtype)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_flatiter = __Pyx_ImportType("numpy", "flatiter", sizeof(PyArrayIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_flatiter)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 165; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_broadcast = __Pyx_ImportType("numpy", "broadcast", sizeof(PyArrayMultiIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_broadcast)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 169; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_ndarray = __Pyx_ImportType("numpy", "ndarray", sizeof(PyArrayObject), 0); if (unlikely(!__pyx_ptype_5numpy_ndarray)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 178; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_ufunc = __Pyx_ImportType("numpy", "ufunc", sizeof(PyUFuncObject), 0); if (unlikely(!__pyx_ptype_5numpy_ufunc)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 861; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  /*--- Variable import code ---*/
+  /*--- Function import code ---*/
+  /*--- Execution code ---*/
+
+  /* "nms/cpu_nms.pyx":8
+ * # --------------------------------------------------------
+ * 
+ * import numpy as np             # <<<<<<<<<<<<<<
+ * cimport numpy as np
+ * 
+ */
+  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":17
+ *     return a if a <= b else b
+ * 
+ * def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+ *     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+ */
+  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_3nms_7cpu_nms_1cpu_nms, NULL, __pyx_n_s_nms_cpu_nms); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_cpu_nms, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "nms/cpu_nms.pyx":1
+ * # --------------------------------------------------------             # <<<<<<<<<<<<<<
+ * # Fast R-CNN
+ * # Copyright (c) 2015 Microsoft
+ */
+  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976
+ *      arr.base = baseptr
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     if arr.base is NULL:
+ *         return None
+ *
+ * NOTE(review): the snippet quoted above does not describe the statements
+ * that follow. What follows is this function's own exit code: the success
+ * path jumps over the error handler to __pyx_L0.
+ */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  if (__pyx_m) { /* module object exists: add a traceback entry, then release it */
+    __Pyx_AddTraceback("init nms.cpu_nms", __pyx_clineno, __pyx_lineno, __pyx_filename);
+    Py_DECREF(__pyx_m); __pyx_m = 0;
+  } else if (!PyErr_Occurred()) { /* no module and no pending exception: raise ImportError */
+    PyErr_SetString(PyExc_ImportError, "init nms.cpu_nms");
+  }
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  #if PY_MAJOR_VERSION < 3
+  return;
+  #else
+  return __pyx_m;
+  #endif
+}
+
+/* Runtime support code */
+#if CYTHON_REFNANNY /* compiled only when CYTHON_REFNANNY is enabled */
+static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) { /* Import
+ * module `modname`, read its "RefNannyAPI" attribute and convert that value
+ * to a pointer with PyLong_AsVoidPtr(). Returns the pointer cast to
+ * __Pyx_RefNannyAPIStruct*. If the import or the attribute lookup fails the
+ * result is NULL and the Python error is left set for the caller. Both
+ * temporary references are released on every path.
+ */
+    PyObject *m = NULL, *p = NULL;
+    void *r = NULL;
+    m = PyImport_ImportModule((char *)modname);
+    if (!m) goto end;
+    p = PyObject_GetAttrString(m, (char *)"RefNannyAPI");
+    if (!p) goto end;
+    r = PyLong_AsVoidPtr(p);
+end:
+    Py_XDECREF(p);
+    Py_XDECREF(m);
+    return (__Pyx_RefNannyAPIStruct *)r;
+}
+#endif /* CYTHON_REFNANNY */
+
+/* Look `name` up as an attribute of the builtins module object (__pyx_b).
+ * Returns the lookup result, or NULL with a NameError set when it fails. */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name) {
+    PyObject* builtin = __Pyx_PyObject_GetAttrStr(__pyx_b, name);
+    if (likely(builtin))
+        return builtin;
+    /* Report the failed lookup as a NameError. */
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_NameError,
+            "name '%U' is not defined", name);
+#else
+    PyErr_Format(PyExc_NameError,
+            "name '%.200s' is not defined", PyString_AS_STRING(name));
+#endif
+    return builtin;
+}
+
+/* Set a TypeError describing a wrong number of positional arguments.
+ *
+ * func_name: name reported in the message
+ * exact:     non-zero to say "exactly" instead of "at least"/"at most"
+ * num_min:   minimum accepted count (reported when too few were given)
+ * num_max:   maximum accepted count (reported otherwise)
+ * num_found: number of positional arguments actually passed
+ */
+static void __Pyx_RaiseArgtupleInvalid(
+    const char* func_name,
+    int exact,
+    Py_ssize_t num_min,
+    Py_ssize_t num_max,
+    Py_ssize_t num_found)
+{
+    const int too_few = (num_found < num_min);
+    const Py_ssize_t num_expected = too_few ? num_min : num_max;
+    const char *more_or_less = exact ? "exactly" : (too_few ? "at least" : "at most");
+    const char *plural = (num_expected == 1) ? "" : "s";
+    PyErr_Format(PyExc_TypeError,
+                 "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                 func_name, more_or_less, num_expected, plural, num_found);
+}
+
+/* Set a TypeError reporting that keyword argument `kw_name` of function
+ * `func_name` received more than one value. */
+static void __Pyx_RaiseDoubleKeywordsError(
+    const char* func_name,
+    PyObject* kw_name)
+{
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%U'", func_name, kw_name);
+#else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%s'", func_name,
+        PyString_AsString(kw_name));
+#endif
+}
+
+/* Distribute the entries of the keyword dict `kwds` into `values`.
+ *
+ * kwds:          dict of keyword arguments to process
+ * argnames:      NULL-terminated array of pointers to the argument name
+ *                objects; the first `num_pos_args` entries are the names
+ *                already supplied positionally
+ * kwds2:         optional dict that receives keywords matching no name;
+ *                when NULL an unmatched keyword is an error
+ * values:        output array indexed like `argnames`; the matched value
+ *                pointer is stored as obtained from PyDict_Next (no INCREF
+ *                is done here)
+ * num_pos_args:  number of leading entries of `argnames` to treat as
+ *                positionally filled
+ * function_name: name used in error messages
+ *
+ * Returns 0 on success, -1 with a TypeError (or a comparison error) set.
+ */
+static int __Pyx_ParseOptionalKeywords(
+    PyObject *kwds,
+    PyObject **argnames[],
+    PyObject *kwds2,
+    PyObject *values[],
+    Py_ssize_t num_pos_args,
+    const char* function_name)
+{
+    PyObject *key = 0, *value = 0;
+    Py_ssize_t pos = 0;
+    PyObject*** name;
+    PyObject*** first_kw_arg = argnames + num_pos_args;
+    while (PyDict_Next(kwds, &pos, &key, &value)) {
+        /* Fast path: match the key by pointer identity against the names
+           that may still be passed by keyword. */
+        name = first_kw_arg;
+        while (*name && (**name != key)) name++;
+        if (*name) {
+            values[name-argnames] = value;
+            continue;
+        }
+        /* Slow path: compare by string content. */
+        name = first_kw_arg;
+        #if PY_MAJOR_VERSION < 3
+        if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) {
+            while (*name) {
+                if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key))
+                        && _PyString_Eq(**name, key)) {
+                    values[name-argnames] = value;
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                /* No keyword slot matched: check whether the key names an
+                   argument that was already given positionally. */
+                PyObject*** argname = argnames;
+                while (argname != first_kw_arg) {
+                    if ((**argname == key) || (
+                            (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key))
+                             && _PyString_Eq(**argname, key))) {
+                        goto arg_passed_twice;
+                    }
+                    argname++;
+                }
+            }
+        } else
+        #endif
+        if (likely(PyUnicode_Check(key))) {
+            while (*name) {
+                /* cmp == 0 means equal; a size mismatch short-circuits to
+                   "different" without calling PyUnicode_Compare. */
+                int cmp = (**name == key) ? 0 :
+                #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                    (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :
+                #endif
+                    PyUnicode_Compare(**name, key);
+                if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
+                if (cmp == 0) {
+                    values[name-argnames] = value;
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                /* Same duplicate check as above, for unicode keys. */
+                PyObject*** argname = argnames;
+                while (argname != first_kw_arg) {
+                    int cmp = (**argname == key) ? 0 :
+                    #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                        (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :
+                    #endif
+                        PyUnicode_Compare(**argname, key);
+                    if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
+                    if (cmp == 0) goto arg_passed_twice;
+                    argname++;
+                }
+            }
+        } else
+            goto invalid_keyword_type;
+        /* The key matched no declared argument name. */
+        if (kwds2) {
+            if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad;
+        } else {
+            goto invalid_keyword;
+        }
+    }
+    return 0;
+arg_passed_twice:
+    __Pyx_RaiseDoubleKeywordsError(function_name, key);
+    goto bad;
+invalid_keyword_type:
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() keywords must be strings", function_name);
+    goto bad;
+invalid_keyword:
+    PyErr_Format(PyExc_TypeError,
+    #if PY_MAJOR_VERSION < 3
+        "%.200s() got an unexpected keyword argument '%.200s'",
+        function_name, PyString_AsString(key));
+    #else
+        "%s() got an unexpected keyword argument '%U'",
+        function_name, key);
+    #endif
+bad:
+    return -1;
+}
+
+/* Set a TypeError saying argument `name` was expected to be of `type` but
+ * `obj` has a different type. */
+static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) {
+    const char *expected_name = type->tp_name;
+    const char *actual_name = Py_TYPE(obj)->tp_name;
+    PyErr_Format(PyExc_TypeError,
+        "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)",
+        name, expected_name, actual_name);
+}
+/* Check that argument `obj` is acceptable for a parameter of `type`.
+ *
+ * none_allowed: non-zero to accept Py_None regardless of `type`
+ * name:         argument name used in the error message
+ * exact:        non-zero to require Py_TYPE(obj) == type rather than an
+ *               instance check
+ *
+ * Returns 1 when the argument is accepted; otherwise sets an exception
+ * (SystemError for a missing type object, TypeError for a mismatch) and
+ * returns 0.
+ */
+static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed,
+    const char *name, int exact)
+{
+    if (unlikely(!type)) {
+        PyErr_SetString(PyExc_SystemError, "Missing type object");
+        return 0;
+    }
+    if (none_allowed && obj == Py_None)
+        return 1;
+    if (exact) {
+        if (likely(Py_TYPE(obj) == type))
+            return 1;
+        #if PY_MAJOR_VERSION == 2
+        /* On Python 2 a basestring parameter also accepts exact str/unicode. */
+        if ((type == &PyBaseString_Type) && likely(__Pyx_PyBaseString_CheckExact(obj)))
+            return 1;
+        #endif
+    } else if (likely(PyObject_TypeCheck(obj, type))) {
+        return 1;
+    }
+    __Pyx_RaiseArgumentTypeInvalid(name, obj, type);
+    return 0;
+}
+
+/* Return non-zero when the lowest-addressed byte of an unsigned int holding
+ * the value 1 is non-zero, i.e. on a little-endian target. */
+static CYTHON_INLINE int __Pyx_IsLittleEndian(void) {
+  unsigned int probe = 1;
+  unsigned char *first_byte = (unsigned char*)(&probe);
+  return first_byte[0] != 0;
+}
+/* Prepare `ctx` for checking a buffer format string against `type`.
+ *
+ * `stack` provides the storage for the field stack; its first element is
+ * bound to the context's root pseudo-field.  While the expected type is a
+ * struct (typegroup 'S'), one stack level per nesting level is pushed so the
+ * head ends up at the first field of the innermost leading struct.
+ */
+static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
+                              __Pyx_BufFmt_StackElem* stack,
+                              __Pyx_TypeInfo* type) {
+  __Pyx_TypeInfo* cursor = type;
+
+  /* Root pseudo-field standing for the whole buffer dtype. */
+  ctx->root.type = type;
+  ctx->root.name = "buffer dtype";
+  ctx->root.offset = 0;
+
+  /* Bottom of the field stack refers to the root. */
+  ctx->head = stack;
+  stack[0].field = &ctx->root;
+  stack[0].parent_offset = 0;
+
+  /* Parser state. */
+  ctx->fmt_offset = 0;
+  ctx->new_packmode = '@';
+  ctx->enc_packmode = '@';
+  ctx->new_count = 1;
+  ctx->enc_count = 0;
+  ctx->enc_type = 0;
+  ctx->is_complex = 0;
+  ctx->is_valid_array = 0;
+  ctx->struct_alignment = 0;
+
+  /* Descend into leading nested structs. */
+  for (; cursor->typegroup == 'S'; cursor = cursor->fields->type) {
+    ++ctx->head;
+    ctx->head->field = cursor->fields;
+    ctx->head->parent_offset = 0;
+  }
+}
+/* Parse an unsigned decimal number at *ts.
+ *
+ * Returns the parsed value and advances *ts past the digits, or returns -1
+ * and leaves *ts untouched when the first character is not a digit.
+ */
+static int __Pyx_BufFmt_ParseNumber(const char** ts) {
+    int count;
+    const char* t = *ts;
+    if (*t < '0' || *t > '9') {
+        return -1;
+    }
+    count = *t++ - '0';
+    /* The upper bound must include '9'; with `< '9'` a count such as "19"
+       was cut off after the first digit. */
+    while (*t >= '0' && *t <= '9') {
+        count *= 10;
+        count += *t++ - '0';
+    }
+    *ts = t;
+    return count;
+}
+/* Like __Pyx_BufFmt_ParseNumber, but additionally sets a ValueError naming
+ * the offending character when no number is found.  Returns the number or
+ * -1. */
+static int __Pyx_BufFmt_ExpectNumber(const char **ts) {
+    const int parsed = __Pyx_BufFmt_ParseNumber(ts);
+    if (parsed != -1)
+        return parsed;
+    /* First char was not a digit */
+    PyErr_Format(PyExc_ValueError,
+                 "Does not understand character buffer dtype format string ('%c')", **ts);
+    return parsed;
+}
+/* Set a ValueError reporting that format character `ch` was not expected. */
+static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) {
+  const char offending = ch;
+  PyErr_Format(PyExc_ValueError, "Unexpected format string character: '%c'", offending);
+}
+/* Return a human-readable description of format character `ch` for use in
+ * error messages.  `is_complex` selects the complex wording for the
+ * floating-point codes 'f', 'd' and 'g'. */
+static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) {
+  const char* description;
+  switch (ch) {
+    case 'c': description = "'char'"; break;
+    case 'b': description = "'signed char'"; break;
+    case 'B': description = "'unsigned char'"; break;
+    case 'h': description = "'short'"; break;
+    case 'H': description = "'unsigned short'"; break;
+    case 'i': description = "'int'"; break;
+    case 'I': description = "'unsigned int'"; break;
+    case 'l': description = "'long'"; break;
+    case 'L': description = "'unsigned long'"; break;
+    case 'q': description = "'long long'"; break;
+    case 'Q': description = "'unsigned long long'"; break;
+    case 'f':
+      if (is_complex) description = "'complex float'";
+      else description = "'float'";
+      break;
+    case 'd':
+      if (is_complex) description = "'complex double'";
+      else description = "'double'";
+      break;
+    case 'g':
+      if (is_complex) description = "'complex long double'";
+      else description = "'long double'";
+      break;
+    case 'T': description = "a struct"; break;
+    case 'O': description = "Python object"; break;
+    case 'P': description = "a pointer"; break;
+    case 's':
+    case 'p': description = "a string"; break;
+    case 0: description = "end"; break;
+    default: description = "unparseable format string"; break;
+  }
+  return description;
+}
+/* Return the size in bytes used for format character `ch` in the standard
+ * (non-native) pack modes; complex 'f'/'d' count twice.  Returns 0 with a
+ * ValueError set for 'g' and for unknown characters. */
+static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) {
+  size_t size = 0;
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p':
+      size = 1;
+      break;
+    case 'h': case 'H':
+      size = 2;
+      break;
+    case 'i': case 'I': case 'l': case 'L':
+      size = 4;
+      break;
+    case 'q': case 'Q':
+      size = 8;
+      break;
+    case 'f':
+      size = is_complex ? 8 : 4;
+      break;
+    case 'd':
+      size = is_complex ? 16 : 8;
+      break;
+    case 'g':
+      PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g')..");
+      break;
+    case 'O': case 'P':
+      size = sizeof(void*);
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return size;
+}
+/* Return the compiler's native size in bytes for format character `ch`;
+ * the floating-point codes are doubled when `is_complex` is set.  Returns 0
+ * with a ValueError set for characters not handled here. */
+static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) {
+  const size_t parts = is_complex ? 2 : 1;
+  size_t size = 0;
+  switch (ch) {
+    case 'c': case 'b': case 'B': case 's': case 'p':
+      size = 1;
+      break;
+    case 'h': case 'H':
+      size = sizeof(short);
+      break;
+    case 'i': case 'I':
+      size = sizeof(int);
+      break;
+    case 'l': case 'L':
+      size = sizeof(long);
+      break;
+    #ifdef HAVE_LONG_LONG
+    case 'q': case 'Q':
+      size = sizeof(PY_LONG_LONG);
+      break;
+    #endif
+    case 'f':
+      size = sizeof(float) * parts;
+      break;
+    case 'd':
+      size = sizeof(double) * parts;
+      break;
+    case 'g':
+      size = sizeof(long double) * parts;
+      break;
+    case 'O': case 'P':
+      size = sizeof(void*);
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return size;
+}
+/* Helper structs placing a single char in front of a member of each basic
+   type.  sizeof(struct) - sizeof(member) is what the function below reports
+   as the alignment of that type. */
+typedef struct { char c; short x; } __Pyx_st_short;
+typedef struct { char c; int x; } __Pyx_st_int;
+typedef struct { char c; long x; } __Pyx_st_long;
+typedef struct { char c; float x; } __Pyx_st_float;
+typedef struct { char c; double x; } __Pyx_st_double;
+typedef struct { char c; long double x; } __Pyx_st_longdouble;
+typedef struct { char c; void *x; } __Pyx_st_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong;
+#endif
+/* Return the alignment (in bytes) to apply before an item of format
+   character `ch` in native '@' mode.  Returns 0 with a ValueError set for
+   characters not handled here; `is_complex` is not used. */
+static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED int is_complex) {
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1;
+    case 'h': case 'H': return sizeof(__Pyx_st_short) - sizeof(short);
+    case 'i': case 'I': return sizeof(__Pyx_st_int) - sizeof(int);
+    case 'l': case 'L': return sizeof(__Pyx_st_long) - sizeof(long);
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q': return sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG);
+#endif
+    case 'f': return sizeof(__Pyx_st_float) - sizeof(float);
+    case 'd': return sizeof(__Pyx_st_double) - sizeof(double);
+    case 'g': return sizeof(__Pyx_st_longdouble) - sizeof(long double);
+    case 'P': case 'O': return sizeof(__Pyx_st_void_p) - sizeof(void*);
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+    }
+}
+/* These are for computing the padding at the end of the struct to align
+   on the first member of the struct. This will probably be the same as
+   above, but we don't have any guarantees.
+ */
+typedef struct { short x; char c; } __Pyx_pad_short;
+typedef struct { int x; char c; } __Pyx_pad_int;
+typedef struct { long x; char c; } __Pyx_pad_long;
+typedef struct { float x; char c; } __Pyx_pad_float;
+typedef struct { double x; char c; } __Pyx_pad_double;
+typedef struct { long double x; char c; } __Pyx_pad_longdouble;
+typedef struct { void *x; char c; } __Pyx_pad_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong;
+#endif
+/* Return sizeof(pad struct) - sizeof(member) for format character `ch`,
+   where the pad struct holds the member followed by a single char.  The
+   caller in this file stores the result as the struct alignment.  Returns 0
+   with a ValueError set for characters not handled here; `is_complex` is
+   not used. */
+static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int is_complex) {
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1;
+    case 'h': case 'H': return sizeof(__Pyx_pad_short) - sizeof(short);
+    case 'i': case 'I': return sizeof(__Pyx_pad_int) - sizeof(int);
+    case 'l': case 'L': return sizeof(__Pyx_pad_long) - sizeof(long);
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q': return sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG);
+#endif
+    case 'f': return sizeof(__Pyx_pad_float) - sizeof(float);
+    case 'd': return sizeof(__Pyx_pad_double) - sizeof(double);
+    case 'g': return sizeof(__Pyx_pad_longdouble) - sizeof(long double);
+    case 'P': case 'O': return sizeof(__Pyx_pad_void_p) - sizeof(void*);
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+    }
+}
+/* Map format character `ch` to the single-letter type group that is
+ * compared against __Pyx_TypeInfo.typegroup: 'H' for 'c', 'I' for the signed
+ * integer and string codes, 'U' for the unsigned integer codes, 'C' or 'R'
+ * for floating-point codes depending on `is_complex`, 'O' and 'P' for
+ * themselves.  Returns 0 with a ValueError set for any other character. */
+static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) {
+  char group = 0;
+  switch (ch) {
+    case 'c':
+      group = 'H';
+      break;
+    case 'b': case 'h': case 'i':
+    case 'l': case 'q': case 's': case 'p':
+      group = 'I';
+      break;
+    case 'B': case 'H': case 'I': case 'L': case 'Q':
+      group = 'U';
+      break;
+    case 'f': case 'd': case 'g':
+      if (is_complex) group = 'C';
+      else group = 'R';
+      break;
+    case 'O':
+      group = 'O';
+      break;
+    case 'P':
+      group = 'P';
+      break;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      break;
+  }
+  return group;
+}
+/* Set a ValueError describing a mismatch between the type the context
+ * expects next and the type character last read from the format string
+ * (ctx->enc_type).  Three cases: nothing more expected (head is NULL), the
+ * root dtype expected, or a named struct field expected. */
+static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) {
+  const char* got = __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex);
+  if (ctx->head == NULL) {
+    /* The expected type was fully consumed, yet more input arrived. */
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected %s%s%s but got %s",
+                 "", "end", "", got);
+    return;
+  }
+  if (ctx->head->field == &ctx->root) {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected %s%s%s but got %s",
+                 "'", ctx->head->field->type->name, "'", got);
+    return;
+  }
+  {
+    /* Inside a struct: name the field and the struct one level up. */
+    __Pyx_StructField* expected_field = ctx->head->field;
+    __Pyx_StructField* enclosing = (ctx->head - 1)->field;
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'",
+                 expected_field->type->name, got,
+                 enclosing->type->name, expected_field->name);
+  }
+}
+/* Check the pending run of ctx->enc_count items of type ctx->enc_type
+ * against the expected fields on the context's field stack, advancing
+ * ctx->fmt_offset and the stack head as items are consumed.
+ *
+ * Returns 0 on success (also when nothing is pending) and -1 with an
+ * exception set on a mismatch.  On success the pending type is cleared.
+ */
+static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) {
+  char group;
+  size_t size, offset, arraysize = 1;
+  if (ctx->enc_type == 0) return 0;
+  /* The expected field is an array type. */
+  if (ctx->head->field->type->arraysize[0]) {
+    int i, ndim = 0;
+    /* String codes carry their length as the repeat count and count as a
+       one-dimensional array. */
+    if (ctx->enc_type == 's' || ctx->enc_type == 'p') {
+        ctx->is_valid_array = ctx->head->field->type->ndim == 1;
+        ndim = 1;
+        if (ctx->enc_count != ctx->head->field->type->arraysize[0]) {
+            PyErr_Format(PyExc_ValueError,
+                         "Expected a dimension of size %zu, got %zu",
+                         ctx->head->field->type->arraysize[0], ctx->enc_count);
+            return -1;
+        }
+    }
+    if (!ctx->is_valid_array) {
+      PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d",
+                   ctx->head->field->type->ndim, ndim);
+      return -1;
+    }
+    /* Total element count is the product of all dimensions. */
+    for (i = 0; i < ctx->head->field->type->ndim; i++) {
+      arraysize *= ctx->head->field->type->arraysize[i];
+    }
+    ctx->is_valid_array = 0;
+    ctx->enc_count = 1;
+  }
+  group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, ctx->is_complex);
+  do {
+    __Pyx_StructField* field = ctx->head->field;
+    __Pyx_TypeInfo* type = field->type;
+    /* '@' and '^' use native sizes; other pack modes use standard sizes. */
+    if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') {
+      size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex);
+    } else {
+      size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex);
+    }
+    /* Only '@' applies alignment: round fmt_offset up to the item's
+       alignment and record the struct alignment from the first item. */
+    if (ctx->enc_packmode == '@') {
+      size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex);
+      size_t align_mod_offset;
+      if (align_at == 0) return -1;
+      align_mod_offset = ctx->fmt_offset % align_at;
+      if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset;
+      if (ctx->struct_alignment == 0)
+          ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type,
+                                                                 ctx->is_complex);
+    }
+    if (type->size != size || type->typegroup != group) {
+      /* An expected complex type that has fields is matched field by
+         field: push its fields and retry. */
+      if (type->typegroup == 'C' && type->fields != NULL) {
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        ++ctx->head;
+        ctx->head->field = type->fields;
+        ctx->head->parent_offset = parent_offset;
+        continue;
+      }
+      /* Group 'H' on either side is accepted when the sizes agree. */
+      if ((type->typegroup == 'H' || group == 'H') && type->size == size) {
+      } else {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+      }
+    }
+    offset = ctx->head->parent_offset + field->offset;
+    if (ctx->fmt_offset != offset) {
+      PyErr_Format(PyExc_ValueError,
+                   "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected",
+                   (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset);
+      return -1;
+    }
+    ctx->fmt_offset += size;
+    if (arraysize)
+      ctx->fmt_offset += (arraysize - 1) * size;
+    --ctx->enc_count; /* Consume from buffer string */
+    /* Advance the stack head to the next expected field. */
+    while (1) {
+      if (field == &ctx->root) {
+        ctx->head = NULL;
+        if (ctx->enc_count != 0) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+        }
+        break; /* breaks both loops as ctx->enc_count == 0 */
+      }
+      ctx->head->field = ++field;
+      if (field->type == NULL) {
+        /* End of this struct's field list: pop to the enclosing level. */
+        --ctx->head;
+        field = ctx->head->field;
+        continue;
+      } else if (field->type->typegroup == 'S') {
+        /* Next field is a nested struct: push its first field. */
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        if (field->type->fields->type == NULL) continue; /* empty struct */
+        field = field->type->fields;
+        ++ctx->head;
+        ctx->head->field = field;
+        ctx->head->parent_offset = parent_offset;
+        break;
+      } else {
+        break;
+      }
+    }
+  } while (ctx->enc_count);
+  ctx->enc_type = 0;
+  ctx->is_complex = 0;
+  return 0;
+}
+/* Parse an array shape of the form "(d0,d1,...)" starting at **tsp and check
+ * it against the array dimensions of the currently expected field.
+ *
+ * On success sets ctx->is_valid_array, advances *tsp past the closing ')'
+ * and returns Py_None.  The return value is only a non-NULL success marker:
+ * no reference is taken on Py_None here, so callers must not DECREF it.
+ * On failure returns NULL with a ValueError set.
+ */
+static CYTHON_INLINE PyObject *
+__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp)
+{
+    const char *ts = *tsp;
+    int i = 0, number;
+    int ndim = ctx->head->field->type->ndim;
+    ++ts;  /* skip the opening '(' */
+    if (ctx->new_count != 1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Cannot handle repeated arrays in format string");
+        return NULL;
+    }
+    if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+    while (*ts && *ts != ')') {
+        switch (*ts) {
+            case ' ': case '\f': case '\r': case '\n': case '\t': case '\v':
+                /* Skip whitespace.  The pointer has to advance here: a bare
+                   'continue' re-examined the same character forever. */
+                ++ts;
+                continue;
+            default:  break;  /* not a 'break' in the loop */
+        }
+        number = __Pyx_BufFmt_ExpectNumber(&ts);
+        if (number == -1) return NULL;
+        if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i])
+            return PyErr_Format(PyExc_ValueError,
+                        "Expected a dimension of size %zu, got %d",
+                        ctx->head->field->type->arraysize[i], number);
+        if (*ts != ',' && *ts != ')')
+            return PyErr_Format(PyExc_ValueError,
+                                "Expected a comma in format string, got '%c'", *ts);
+        if (*ts == ',') ts++;
+        i++;
+    }
+    if (i != ndim)
+        return PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d",
+                            ctx->head->field->type->ndim, i);
+    if (!*ts) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Unexpected end of format string, expected ')'");
+        return NULL;
+    }
+    ctx->is_valid_array = 1;
+    ctx->new_count = 1;
+    *tsp = ++ts;
+    return Py_None;
+}
+/* Walk the buffer format string `ts` and check it against the expected type
+ * held in `ctx`.
+ *
+ * Consecutive identical type codes are accumulated in ctx->enc_type /
+ * ctx->enc_count and flushed through __Pyx_BufFmt_ProcessTypeChunk whenever
+ * the type, complex flag or pack mode changes.  The function recurses for
+ * "T{...}" sub-structs and returns from the recursion at the matching '}'.
+ *
+ * Returns a pointer just past the consumed input (the terminating NUL at top
+ * level, or the character after '}' inside a sub-struct), or NULL with an
+ * exception set on any mismatch or malformed input.
+ */
+static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) {
+  int got_Z = 0;
+  while (1) {
+    switch(*ts) {
+      case 0:
+        /* End of input: flush what is pending and require that the whole
+           expected type has been consumed. */
+        if (ctx->enc_type != 0 && ctx->head == NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        if (ctx->head != NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        return ts;
+      case ' ':
+      case 10:
+      case 13:
+        ++ts;
+        break;
+      case '<':
+        if (!__Pyx_IsLittleEndian()) {
+          PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '>':
+      case '!':
+        if (__Pyx_IsLittleEndian()) {
+          PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '=':
+      case '@':
+      case '^':
+        ctx->new_packmode = *ts++;
+        break;
+      case 'T': /* substruct */
+        {
+          const char* ts_after_sub;
+          size_t i, struct_count = ctx->new_count;
+          size_t struct_alignment = ctx->struct_alignment;
+          ctx->new_count = 1;
+          ++ts;
+          if (*ts != '{') {
+            PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'");
+            return NULL;
+          }
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0; /* Erase processed last struct element */
+          ctx->enc_count = 0;
+          ctx->struct_alignment = 0;
+          ++ts;
+          ts_after_sub = ts;
+          /* A repeat count in front of 'T' re-parses the same body that
+             many times. */
+          for (i = 0; i != struct_count; ++i) {
+            ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts);
+            if (!ts_after_sub) return NULL;
+          }
+          ts = ts_after_sub;
+          if (struct_alignment) ctx->struct_alignment = struct_alignment;
+        }
+        break;
+      case '}': /* end of substruct; either repeat or move on */
+        {
+          size_t alignment = ctx->struct_alignment;
+          ++ts;
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0; /* Erase processed last struct element */
+          /* Pad the offset up to the struct's alignment. */
+          if (alignment && ctx->fmt_offset % alignment) {
+            ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment);
+          }
+        }
+        return ts;
+      case 'x':
+        /* Pad bytes: flush pending items, then skip new_count bytes. */
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        ctx->fmt_offset += ctx->new_count;
+        ctx->new_count = 1;
+        ctx->enc_count = 0;
+        ctx->enc_type = 0;
+        ctx->enc_packmode = ctx->new_packmode;
+        ++ts;
+        break;
+      case 'Z':
+        /* Complex prefix; must be followed by a floating-point code. */
+        got_Z = 1;
+        ++ts;
+        if (*ts != 'f' && *ts != 'd' && *ts != 'g') {
+          __Pyx_BufFmt_RaiseUnexpectedChar('Z');
+          return NULL;
+        }        /* fall through */
+      case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I':
+      case 'l': case 'L': case 'q': case 'Q':
+      case 'f': case 'd': case 'g':
+      case 'O': case 's': case 'p':
+        if (ctx->enc_type == *ts && got_Z == ctx->is_complex &&
+            ctx->enc_packmode == ctx->new_packmode) {
+          /* Same type as the pending run: just extend the run. */
+          ctx->enc_count += ctx->new_count;
+        } else {
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_count = ctx->new_count;
+          ctx->enc_packmode = ctx->new_packmode;
+          ctx->enc_type = *ts;
+          ctx->is_complex = got_Z;
+        }
+        ++ts;
+        ctx->new_count = 1;
+        got_Z = 0;
+        break;
+      case ':':
+        /* Skip an embedded ":name:" field name.  Stop at the terminating
+           NUL as well, so an unterminated name cannot run the scan past
+           the end of the format string. */
+        ++ts;
+        while (*ts && *ts != ':') ++ts;
+        if (!*ts) {
+          PyErr_SetString(PyExc_ValueError,
+                          "Unexpected end of format string, expected ':'");
+          return NULL;
+        }
+        ++ts;
+        break;
+      case '(':
+        if (!__pyx_buffmt_parse_array(ctx, &ts)) return NULL;
+        break;
+      default:
+        {
+          /* Anything else must be a repeat count for the next code. */
+          int number = __Pyx_BufFmt_ExpectNumber(&ts);
+          if (number == -1) return NULL;
+          ctx->new_count = (size_t)number;
+        }
+    }
+  }
+}
+/* Put `buf` into the "no buffer held" state: NULL data and owner pointers,
+ * with shape/strides/suboffsets pointing at the shared __Pyx_zeros and
+ * __Pyx_minusones arrays. */
+static CYTHON_INLINE void __Pyx_ZeroBuffer(Py_buffer* buf) {
+  /* No data and no exporting object. */
+  buf->obj = NULL;
+  buf->buf = NULL;
+  /* Shared placeholder arrays for the geometry fields. */
+  buf->shape = __Pyx_zeros;
+  buf->strides = __Pyx_zeros;
+  buf->suboffsets = __Pyx_minusones;
+}
+/* Acquire a buffer view of `obj` into `buf` and validate it.
+ *
+ * buf:   view to fill
+ * obj:   exporter; Py_None or NULL yields a zeroed view and success
+ * dtype: expected element type
+ * flags: request flags passed to __Pyx_GetBuffer
+ * nd:    required number of dimensions
+ * cast:  non-zero to skip the format-string check (only the item size is
+ *        compared)
+ * stack: scratch storage for the format checker's field stack
+ *
+ * Returns 0 on success.  On failure returns -1 with an exception set and
+ * `buf` zeroed; a view that was already acquired is released first so the
+ * exporter is not left with a dangling export.
+ */
+static CYTHON_INLINE int __Pyx_GetBufferAndValidate(
+        Py_buffer* buf, PyObject* obj,  __Pyx_TypeInfo* dtype, int flags,
+        int nd, int cast, __Pyx_BufFmt_StackElem* stack)
+{
+  if (obj == Py_None || obj == NULL) {
+    __Pyx_ZeroBuffer(buf);
+    return 0;
+  }
+  /* buf->buf stays NULL if acquisition fails, which is what the failure
+     path uses to tell whether there is anything to release. */
+  buf->buf = NULL;
+  if (__Pyx_GetBuffer(obj, buf, flags) == -1) goto fail;
+  if (buf->ndim != nd) {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer has wrong number of dimensions (expected %d, got %d)",
+                 nd, buf->ndim);
+    goto fail;
+  }
+  if (!cast) {
+    __Pyx_BufFmt_Context ctx;
+    __Pyx_BufFmt_Init(&ctx, stack, dtype);
+    if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail;
+  }
+  if ((unsigned)buf->itemsize != dtype->size) {
+    PyErr_Format(PyExc_ValueError,
+      "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)",
+      buf->itemsize, (buf->itemsize > 1) ? "s" : "",
+      dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : "");
+    goto fail;
+  }
+  if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones;
+  return 0;
+fail:;
+  /* Release a successfully acquired view before discarding it; zeroing it
+     without releasing leaked the export.  suboffsets has not been
+     replaced by __Pyx_minusones on any path that reaches here, so the view
+     can be handed back as-is. */
+  if (buf->buf != NULL) __Pyx_ReleaseBuffer(buf);
+  __Pyx_ZeroBuffer(buf);
+  return -1;
+}
+/* Release the view `info` unless its data pointer is NULL.  A suboffsets
+ * pointer equal to the shared __Pyx_minusones placeholder is reset to NULL
+ * before the view is handed to __Pyx_ReleaseBuffer. */
+static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) {
+  if (info->buf != NULL) {
+    if (info->suboffsets == __Pyx_minusones) {
+      info->suboffsets = NULL;
+    }
+    __Pyx_ReleaseBuffer(info);
+  }
+}
+
+/* Return 1 when `obj` is an instance of `type`.  Otherwise set an exception
+ * (SystemError when `type` is NULL, TypeError on a mismatch) and return 0. */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) {
+    if (unlikely(!type)) {
+        PyErr_SetString(PyExc_SystemError, "Missing type object");
+        return 0;
+    }
+    if (unlikely(!PyObject_TypeCheck(obj, type))) {
+        PyErr_Format(PyExc_TypeError, "Cannot convert %.200s to %.200s",
+                     Py_TYPE(obj)->tp_name, type->tp_name);
+        return 0;
+    }
+    return 1;
+}
+
+#if CYTHON_COMPILING_IN_CPYTHON
+/* Call `func` with positional tuple `arg` and keyword dict `kw` by invoking
+ * its tp_call slot directly (inside a recursion guard where available).
+ * Falls back to PyObject_Call when the slot is empty.  A NULL result without
+ * a pending exception is turned into a SystemError. */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) {
+    PyObject *retval;
+    ternaryfunc slot = func->ob_type->tp_call;
+    if (unlikely(slot == NULL))
+        return PyObject_Call(func, arg, kw);
+#if PY_VERSION_HEX >= 0x02060000
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object")))
+        return NULL;
+    retval = slot(func, arg, kw);
+    Py_LeaveRecursiveCall();
+#else
+    retval = slot(func, arg, kw);
+#endif
+    if (unlikely(retval == NULL) && unlikely(!PyErr_Occurred())) {
+        PyErr_SetString(
+            PyExc_SystemError,
+            "NULL result without error in PyObject_Call");
+    }
+    return retval;
+}
+#endif
+
+/* Look `name` up in the module dict (__pyx_d) and fall back to
+ * __Pyx_GetBuiltinName when it is not found there.  Returns a new reference,
+ * or whatever the builtin lookup returns on a miss. */
+static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) {
+    PyObject *found;
+#if CYTHON_COMPILING_IN_CPYTHON
+    /* PyDict_GetItem hands back a borrowed reference; own it before
+       returning. */
+    found = PyDict_GetItem(__pyx_d, name);
+    if (found) {
+        Py_INCREF(found);
+        return found;
+    }
+#else
+    found = PyObject_GetItem(__pyx_d, name);
+    if (found)
+        return found;
+    /* Discard the lookup error before trying the builtins. */
+    PyErr_Clear();
+#endif
+    return __Pyx_GetBuiltinName(name);
+}
+
+/* Set an IndexError reporting an out-of-bounds buffer access on `axis`. */
+static void __Pyx_RaiseBufferIndexError(int axis) {
+  const int bad_axis = axis;
+  PyErr_Format(PyExc_IndexError, "Out of bounds on buffer access (axis %d)", bad_axis);
+}
+
+/* Install (type, value, tb) as the current exception.  On CPython the
+ * thread state fields are written directly and the previously stored
+ * exception objects are released afterwards; elsewhere PyErr_Restore is
+ * used. */
+static CYTHON_INLINE void __Pyx_ErrRestore(PyObject *type, PyObject *value, PyObject *tb) {
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyThreadState *tstate = PyThreadState_GET();
+    /* Remember the old exception so it can be dropped after the swap. */
+    PyObject *old_type = tstate->curexc_type;
+    PyObject *old_value = tstate->curexc_value;
+    PyObject *old_tb = tstate->curexc_traceback;
+    tstate->curexc_type = type;
+    tstate->curexc_value = value;
+    tstate->curexc_traceback = tb;
+    Py_XDECREF(old_type);
+    Py_XDECREF(old_value);
+    Py_XDECREF(old_tb);
+#else
+    PyErr_Restore(type, value, tb);
+#endif
+}
+/* Move the current exception out of the thread state into *type, *value and
+ * *tb, leaving no exception set.  On CPython the thread state fields are
+ * read and cleared directly; elsewhere PyErr_Fetch is used. */
+static CYTHON_INLINE void __Pyx_ErrFetch(PyObject **type, PyObject **value, PyObject **tb) {
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyThreadState *tstate = PyThreadState_GET();
+    /* Hand each stored pointer to the caller, then clear the slot. */
+    *type = tstate->curexc_type;
+    tstate->curexc_type = 0;
+    *value = tstate->curexc_value;
+    tstate->curexc_value = 0;
+    *tb = tstate->curexc_traceback;
+    tstate->curexc_traceback = 0;
+#else
+    PyErr_Fetch(type, value, tb);
+#endif
+}
+
+#if PY_MAJOR_VERSION < 3
+/* Python 2 implementation of the `raise` statement.
+ *
+ * type:  exception class or instance
+ * value: exception value; NULL and Py_None both mean "no value"
+ * tb:    traceback; NULL and Py_None both mean "no traceback"
+ * cause: unused in this variant
+ *
+ * Takes its own references to the arguments and either installs the
+ * resulting exception via __Pyx_ErrRestore or, when the arguments are
+ * invalid, sets a TypeError and drops those references again.
+ */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb,
+                        CYTHON_UNUSED PyObject *cause) {
+    Py_XINCREF(type);
+    if (!value || value == Py_None)
+        value = NULL;
+    else
+        Py_INCREF(value);
+    if (!tb || tb == Py_None)
+        tb = NULL;
+    else {
+        Py_INCREF(tb);
+        if (!PyTraceBack_Check(tb)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: arg 3 must be a traceback or None");
+            goto raise_error;
+        }
+    }
+    /* Only the condition differs between the two branches; both open the
+       same block. */
+    #if PY_VERSION_HEX < 0x02050000
+    if (PyClass_Check(type)) {
+    #else
+    if (PyType_Check(type)) {
+    #endif
+#if CYTHON_COMPILING_IN_PYPY
+        if (!value) {
+            Py_INCREF(Py_None);
+            value = Py_None;
+        }
+#endif
+        PyErr_NormalizeException(&type, &value, &tb);
+    } else {
+        /* `type` is an instance: it becomes the value and its class becomes
+           the type, so a separate value is not allowed. */
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto raise_error;
+        }
+        value = type;
+        #if PY_VERSION_HEX < 0x02050000
+        if (PyInstance_Check(type)) {
+            type = (PyObject*) ((PyInstanceObject*)type)->in_class;
+            Py_INCREF(type);
+        } else {
+            type = 0;
+            PyErr_SetString(PyExc_TypeError,
+                "raise: exception must be an old-style class or instance");
+            goto raise_error;
+        }
+        #else
+        type = (PyObject*) Py_TYPE(type);
+        Py_INCREF(type);
+        if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: exception class must be a subclass of BaseException");
+            goto raise_error;
+        }
+        #endif
+    }
+    /* Hands the references held on type, value and tb to the error
+       indicator. */
+    __Pyx_ErrRestore(type, value, tb);
+    return;
+raise_error:
+    /* Drop the references taken above; the TypeError set before the jump
+       stays pending. */
+    Py_XDECREF(value);
+    Py_XDECREF(type);
+    Py_XDECREF(tb);
+    return;
+}
+#else /* Python 3+ */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) {
+    /* Python 3 implementation of the `raise` statement helper.
+     *
+     * type  - an exception class or an exception instance.
+     * value - NULL/None, an exception instance, a tuple of constructor
+     *         arguments, or a single constructor argument.
+     * tb    - NULL/None or a traceback object to install on the raised error.
+     * cause - NULL, None, an exception class or an exception instance.
+     *
+     * Returns nothing; on every path visible here an error indicator is left
+     * set (either the requested exception or the error that prevented it).
+     */
+    PyObject* owned_instance = NULL;
+    if (tb == Py_None) {
+        tb = 0;
+    } else if (tb && !PyTraceBack_Check(tb)) {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: arg 3 must be a traceback or None");
+        goto bad;
+    }
+    if (value == Py_None)
+        value = 0;
+    if (PyExceptionInstance_Check(type)) {
+        /* `raise instance`: a separate value makes no sense here. */
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto bad;
+        }
+        value = type;
+        type = (PyObject*) Py_TYPE(value);
+    } else if (PyExceptionClass_Check(type)) {
+        PyObject *instance_class = NULL;
+        if (value && PyExceptionInstance_Check(value)) {
+            instance_class = (PyObject*) Py_TYPE(value);
+            if (instance_class != type) {
+                /* PyObject_IsSubclass() returns -1 on error; do not treat
+                 * that as "is a subclass" and then clobber the error. */
+                int is_subclass = PyObject_IsSubclass(instance_class, type);
+                if (is_subclass) {
+                    if (is_subclass == -1)
+                        goto bad;
+                    type = instance_class;
+                } else {
+                    instance_class = NULL;
+                }
+            }
+        }
+        if (!instance_class) {
+            /* value is not a usable instance: call type(*args) to build one. */
+            PyObject *args;
+            if (!value)
+                args = PyTuple_New(0);
+            else if (PyTuple_Check(value)) {
+                Py_INCREF(value);
+                args = value;
+            } else
+                args = PyTuple_Pack(1, value);
+            if (!args)
+                goto bad;
+            owned_instance = PyObject_Call(type, args, NULL);
+            Py_DECREF(args);
+            if (!owned_instance)
+                goto bad;
+            value = owned_instance;
+            if (!PyExceptionInstance_Check(value)) {
+                PyErr_Format(PyExc_TypeError,
+                             "calling %R should have returned an instance of "
+                             "BaseException, not %R",
+                             type, Py_TYPE(value));
+                goto bad;
+            }
+        }
+    } else {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: exception class must be a subclass of BaseException");
+        goto bad;
+    }
+#if PY_VERSION_HEX >= 0x03030000
+    if (cause) {
+#else
+    if (cause && cause != Py_None) {
+#endif
+        PyObject *fixed_cause;
+        if (cause == Py_None) {
+            /* `raise ... from None`: pass NULL to PyException_SetCause(). */
+            fixed_cause = NULL;
+        } else if (PyExceptionClass_Check(cause)) {
+            fixed_cause = PyObject_CallObject(cause, NULL);
+            if (fixed_cause == NULL)
+                goto bad;
+        } else if (PyExceptionInstance_Check(cause)) {
+            fixed_cause = cause;
+            Py_INCREF(fixed_cause);
+        } else {
+            PyErr_SetString(PyExc_TypeError,
+                            "exception causes must derive from "
+                            "BaseException");
+            goto bad;
+        }
+        PyException_SetCause(value, fixed_cause);
+    }
+    PyErr_SetObject(type, value);
+    if (tb) {
+        /* Replace the traceback recorded in the thread state with `tb`. */
+        PyThreadState *tstate = PyThreadState_GET();
+        PyObject* tmp_tb = tstate->curexc_traceback;
+        if (tb != tmp_tb) {
+            Py_INCREF(tb);
+            tstate->curexc_traceback = tb;
+            Py_XDECREF(tmp_tb);
+        }
+    }
+bad:
+    Py_XDECREF(owned_instance);
+    return;
+}
+#endif
+
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) { /* Set a ValueError reporting that unpacking produced more than `expected` values. */
+    PyErr_Format(PyExc_ValueError,
+                 "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)", expected);
+}
+
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) {
+    /* Set a ValueError saying that more than `index` values were needed for
+     * unpacking; the noun is pluralised unless `index` is exactly 1. */
+    const char *plural_suffix = "s";
+    if (index == 1)
+        plural_suffix = "";
+    PyErr_Format(PyExc_ValueError,
+                 "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack",
+                 index, plural_suffix);
+}
+
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) { /* Set the TypeError used when None is iterated or unpacked. */
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
+}
+
+#if PY_MAJOR_VERSION < 3
+static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) { /* Python 2 only: acquire a buffer on `obj` into `view`; returns -1 with an error set when no provider is found. */
+  #if PY_VERSION_HEX >= 0x02060000 /* 2.6+: prefer the native buffer protocol */
+    if (PyObject_CheckBuffer(obj)) return PyObject_GetBuffer(obj, view, flags);
+  #endif
+        if (PyObject_TypeCheck(obj, __pyx_ptype_5numpy_ndarray)) return __pyx_pw_5numpy_7ndarray_1__getbuffer__(obj, view, flags); /* numpy arrays use the generated __getbuffer__ wrapper */
+  #if PY_VERSION_HEX < 0x02060000 /* pre-2.6: look for a getbuffer function pointer stored as a CObject in the type dict */
+    if (obj->ob_type->tp_dict) {
+        PyObject *getbuffer_cobj = PyObject_GetItem(
+            obj->ob_type->tp_dict, __pyx_n_s_pyx_getbuffer);
+        if (getbuffer_cobj) {
+            getbufferproc func = (getbufferproc) PyCObject_AsVoidPtr(getbuffer_cobj);
+            Py_DECREF(getbuffer_cobj);
+            if (!func)
+                goto fail; /* skips the generic TypeError below */
+            return func(obj, view, flags);
+        } else {
+            PyErr_Clear(); /* a missing entry is not an error; fall through to the TypeError */
+        }
+    }
+  #endif
+    PyErr_Format(PyExc_TypeError, "'%.200s' does not have the buffer interface", Py_TYPE(obj)->tp_name);
+#if PY_VERSION_HEX < 0x02060000 /* the label only exists where the goto above is compiled */
+fail:
+#endif
+    return -1;
+}
+static void __Pyx_ReleaseBuffer(Py_buffer *view) { /* Python 2 only: release a buffer previously filled into `view`; no-op when view->obj is NULL. */
+    PyObject *obj = view->obj;
+    if (!obj) return;
+  #if PY_VERSION_HEX >= 0x02060000 /* 2.6+: native buffer protocol handles the release */
+    if (PyObject_CheckBuffer(obj)) {
+        PyBuffer_Release(view);
+        return;
+    }
+  #endif
+        if (PyObject_TypeCheck(obj, __pyx_ptype_5numpy_ndarray)) { __pyx_pw_5numpy_7ndarray_3__releasebuffer__(obj, view); return; } /* numpy arrays use the generated __releasebuffer__ wrapper; returns without touching view->obj here */
+  #if PY_VERSION_HEX < 0x02060000 /* pre-2.6: look for a releasebuffer function pointer stored as a CObject in the type dict */
+    if (obj->ob_type->tp_dict) {
+        PyObject *releasebuffer_cobj = PyObject_GetItem(
+            obj->ob_type->tp_dict, __pyx_n_s_pyx_releasebuffer);
+        if (releasebuffer_cobj) {
+            releasebufferproc func = (releasebufferproc) PyCObject_AsVoidPtr(releasebuffer_cobj);
+            Py_DECREF(releasebuffer_cobj);
+            if (!func)
+                goto fail;
+            func(obj, view);
+            return;
+        } else {
+            PyErr_Clear(); /* a missing entry is not an error */
+        }
+    }
+  #endif
+    goto nofail; /* no provider found: just drop the reference held in the view */
+#if PY_VERSION_HEX < 0x02060000
+fail:
+#endif
+    PyErr_WriteUnraisable(obj); /* only reached through the `fail` label; errors cannot propagate from a void function */
+nofail:
+    Py_DECREF(obj);
+    view->obj = NULL;
+}
+#endif /*  PY_MAJOR_VERSION < 3 */
+
+
+        static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { /* Import module `name` with the given from-list and relative level; returns the module or NULL (0) on failure. */
+    PyObject *empty_list = 0;
+    PyObject *module = 0;
+    PyObject *global_dict = 0;
+    PyObject *empty_dict = 0;
+    PyObject *list;
+    #if PY_VERSION_HEX < 0x03030000 /* before 3.3 the import goes through the `__import__` attribute looked up on __pyx_b */
+    PyObject *py_import;
+    py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import);
+    if (!py_import)
+        goto bad;
+    #endif
+    if (from_list)
+        list = from_list;
+    else {
+        empty_list = PyList_New(0); /* a NULL from_list is replaced by an empty list */
+        if (!empty_list)
+            goto bad;
+        list = empty_list;
+    }
+    global_dict = PyModule_GetDict(__pyx_m); /* not released in the cleanup below */
+    if (!global_dict)
+        goto bad;
+    empty_dict = PyDict_New(); /* passed as the locals argument */
+    if (!empty_dict)
+        goto bad;
+    #if PY_VERSION_HEX >= 0x02050000
+    {
+        #if PY_MAJOR_VERSION >= 3
+        if (level == -1) { /* -1: try a relative import first (only if this module's name contains a dot), then absolute */
+            if (strchr(__Pyx_MODULE_NAME, '.')) {
+                #if PY_VERSION_HEX < 0x03030000
+                PyObject *py_level = PyInt_FromLong(1);
+                if (!py_level)
+                    goto bad;
+                module = PyObject_CallFunctionObjArgs(py_import,
+                    name, global_dict, empty_dict, list, py_level, NULL);
+                Py_DECREF(py_level);
+                #else
+                module = PyImport_ImportModuleLevelObject(
+                    name, global_dict, empty_dict, list, 1);
+                #endif
+                if (!module) {
+                    if (!PyErr_ExceptionMatches(PyExc_ImportError))
+                        goto bad; /* only ImportError triggers the absolute-import fallback */
+                    PyErr_Clear();
+                }
+            }
+            level = 0; /* try absolute import on failure */
+        }
+        #endif
+        if (!module) {
+            #if PY_VERSION_HEX < 0x03030000
+            PyObject *py_level = PyInt_FromLong(level);
+            if (!py_level)
+                goto bad;
+            module = PyObject_CallFunctionObjArgs(py_import,
+                name, global_dict, empty_dict, list, py_level, NULL);
+            Py_DECREF(py_level);
+            #else
+            module = PyImport_ImportModuleLevelObject(
+                name, global_dict, empty_dict, list, level);
+            #endif
+        }
+    }
+    #else
+    if (level>0) {
+        PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");
+        goto bad;
+    }
+    module = PyObject_CallFunctionObjArgs(py_import,
+        name, global_dict, empty_dict, list, NULL);
+    #endif
+bad: /* shared exit: also reached on success, with `module` set */
+    #if PY_VERSION_HEX < 0x03030000
+    Py_XDECREF(py_import);
+    #endif
+    Py_XDECREF(empty_list);
+    Py_XDECREF(empty_dict);
+    return module;
+}
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { /* Convert a C `int` into a Python integer object, picking the constructor by type width and signedness. */
+    const int neg_one = (int) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero; /* true only when the C type cannot represent -1 */
+    if (is_unsigned) {
+        if (sizeof(int) < sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(int) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+        } else if (sizeof(int) <= sizeof(unsigned long long)) {
+            return PyLong_FromUnsignedLongLong((unsigned long long) value);
+        }
+    } else {
+        if (sizeof(int) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(int) <= sizeof(long long)) {
+            return PyLong_FromLongLong((long long) value);
+        }
+    }
+    { /* fallback when no branch above matched: build the integer from the raw bytes of `value` */
+        int one = 1; int little = (int)*(unsigned char *)&one; /* runtime endianness probe */
+        unsigned char *bytes = (unsigned char *)&value;
+        return _PyLong_FromByteArray(bytes, sizeof(int),
+                                     little, !is_unsigned);
+    }
+}
+
+#define __PYX_VERIFY_RETURN_INT(target_type, func_type, func) /* Convert `x` with func, range-check against target_type, and return from the enclosing function; uses `x` and `is_unsigned` from the caller's scope. */ \
+    {                                                                     \
+        func_type value = func(x);                                        \
+        if (sizeof(target_type) < sizeof(func_type)) {                    \
+            if (unlikely(value != (func_type) (target_type) value)) {     \
+                func_type zero = 0;                                       \
+                PyErr_SetString(PyExc_OverflowError,                      \
+                    (is_unsigned && unlikely(value < zero)) ?             \
+                    "can't convert negative value to " #target_type :     \
+                    "value too large to convert to " #target_type);       \
+                return (target_type) -1;                                  \
+            }                                                             \
+        }                                                                 \
+        return (target_type) value;                                       \
+    }
+
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+ #endif
+#endif
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) { /* Convert Python object `x` to a C `int`; returns (int)-1 on the failure paths, which set OverflowError/RuntimeError here or leave the callee's error. */
+    const int neg_one = (int) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero; /* true only when the C type cannot represent -1 */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) { /* Python 2 `int` objects */
+        if (sizeof(int) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG)
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to int");
+                return (int) -1;
+            }
+            return (int) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(int)) { /* fast path: read zero- and one-digit values straight from the PyLong internals */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return (int) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (unlikely(Py_SIZE(x) < 0)) { /* a negative size is treated as a negative value */
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to int");
+                return (int) -1;
+            }
+            if (sizeof(int) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT(int, unsigned long, PyLong_AsUnsignedLong)
+            } else if (sizeof(int) <= sizeof(unsigned long long)) {
+                __PYX_VERIFY_RETURN_INT(int, unsigned long long, PyLong_AsUnsignedLongLong)
+            }
+        } else {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(int)) { /* fast path for values stored in at most one digit; the sign comes from Py_SIZE */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return +(int) ((PyLongObject*)x)->ob_digit[0];
+                    case -1: return -(int) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (sizeof(int) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT(int, long, PyLong_AsLong)
+            } else if (sizeof(int) <= sizeof(long long)) {
+                __PYX_VERIFY_RETURN_INT(int, long long, PyLong_AsLongLong)
+            }
+        }
+        { /* fallback when no sizeof branch above returned: convert through the raw byte representation */
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            int val;
+            PyObject *v = __Pyx_PyNumber_Int(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) { /* _PyLong_AsByteArray needs a PyLong, so promote other results */
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one; /* runtime endianness probe */
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned);
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (int) -1;
+        }
+    } else { /* not an int/long: coerce with __Pyx_PyNumber_Int and convert the result recursively */
+        int val;
+        PyObject *tmp = __Pyx_PyNumber_Int(x);
+        if (!tmp) return (int) -1;
+        val = __Pyx_PyInt_As_int(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+}
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) { /* Convert a C `long` into a Python integer object, picking the constructor by type width and signedness. */
+    const long neg_one = (long) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero; /* true only when the C type cannot represent -1 */
+    if (is_unsigned) {
+        if (sizeof(long) < sizeof(long)) { /* never true for `long`; the same ladder is used for the other integer types */
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(long) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+        } else if (sizeof(long) <= sizeof(unsigned long long)) {
+            return PyLong_FromUnsignedLongLong((unsigned long long) value);
+        }
+    } else {
+        if (sizeof(long) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(long) <= sizeof(long long)) {
+            return PyLong_FromLongLong((long long) value);
+        }
+    }
+    { /* fallback when no branch above matched: build the integer from the raw bytes of `value` */
+        int one = 1; int little = (int)*(unsigned char *)&one; /* runtime endianness probe */
+        unsigned char *bytes = (unsigned char *)&value;
+        return _PyLong_FromByteArray(bytes, sizeof(long),
+                                     little, !is_unsigned);
+    }
+}
+
+#if CYTHON_CCOMPLEX /* native complex support: C++ std::complex or C _Complex_I */
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { /* build a float complex from real part x and imaginary part y */
+      return ::std::complex< float >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { /* build a float complex from real part x and imaginary part y */
+      return x + y*(__pyx_t_float_complex)_Complex_I;
+    }
+  #endif
+#else /* no native complex: the type is a struct with .real and .imag members */
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) { /* build a float complex from real part x and imaginary part y */
+      __pyx_t_float_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+#if CYTHON_CCOMPLEX
+#else
+    /* Return 1 when both the real and imaginary parts compare equal, else 0. */
+    static CYTHON_INLINE int __Pyx_c_eqf(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        if (a.real != b.real)
+            return 0;
+        return a.imag == b.imag;
+    }
+    /* Component-wise sum a + b of two struct-based float complex values. */
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sumf(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        __pyx_t_float_complex total;
+        total.imag = a.imag + b.imag;
+        total.real = a.real + b.real;
+        return total;
+    }
+    /* Component-wise difference a - b of two struct-based float complex values. */
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_difff(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        __pyx_t_float_complex delta;
+        delta.imag = a.imag - b.imag;
+        delta.real = a.real - b.real;
+        return delta;
+    }
+    /* Complex product a * b: (ar*br - ai*bi) + (ar*bi + ai*br)i. */
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prodf(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        __pyx_t_float_complex product;
+        product.imag = a.real * b.imag + a.imag * b.real;
+        product.real = a.real * b.real - a.imag * b.imag;
+        return product;
+    }
+    /* Complex quotient a / b, computed as a * conj(b) / |b|^2. */
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quotf(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+        __pyx_t_float_complex quotient;
+        float norm_sq = b.real * b.real + b.imag * b.imag;
+        quotient.imag = (a.imag * b.real - a.real * b.imag) / norm_sq;
+        quotient.real = (a.real * b.real + a.imag * b.imag) / norm_sq;
+        return quotient;
+    }
+    /* Negation -a: both components change sign. */
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_negf(__pyx_t_float_complex a) {
+        __pyx_t_float_complex negated;
+        negated.imag = -a.imag;
+        negated.real = -a.real;
+        return negated;
+    }
+    /* Return 1 when both components compare equal to zero, else 0. */
+    static CYTHON_INLINE int __Pyx_c_is_zerof(__pyx_t_float_complex a) {
+        if (a.real != 0)
+            return 0;
+        return a.imag == 0;
+    }
+    /* Complex conjugate: same real part, imaginary part with flipped sign. */
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conjf(__pyx_t_float_complex a) {
+        __pyx_t_float_complex conjugate;
+        conjugate.imag = -a.imag;
+        conjugate.real =  a.real;
+        return conjugate;
+    }
+    #if 1
+        /* Magnitude |z|: hypotf() when HAVE_HYPOT is defined and the compiler
+         * is not MSVC, otherwise the plain sqrtf formula. */
+        static CYTHON_INLINE float __Pyx_c_absf(__pyx_t_float_complex z) {
+          #if defined(HAVE_HYPOT) && !defined(_MSC_VER)
+            return hypotf(z.real, z.imag);
+          #else
+            return sqrtf(z.real*z.real + z.imag*z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_powf(__pyx_t_float_complex a, __pyx_t_float_complex b) {
+            /* Complex power a ** b for the struct-based float complex type.
+             *
+             * Small integral real exponents (|b| <= 4) are expanded into
+             * multiplications; everything else goes through the polar form
+             * exp(b * log(a)).  A purely real base is special-cased so that a
+             * negative base never reaches logf() with a negative argument
+             * (which yields NaN): it either uses the real powf() or the polar
+             * form with argument pi.
+             */
+            __pyx_t_float_complex z;
+            float r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) {
+                if (b.real < 0) {
+                    /* a ** -n == (1/a) ** n: invert the base, negate the exponent. */
+                    float denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) {
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prodf(a, a);
+                    case 3:
+                        z = __Pyx_c_prodf(a, a);
+                        return __Pyx_c_prodf(z, a);
+                    case 4:
+                        z = __Pyx_c_prodf(a, a);
+                        return __Pyx_c_prodf(z, z);
+                }
+            }
+            if (a.imag == 0) {
+                if (a.real == 0) {
+                    return a;
+                } else if (b.imag == 0 && (a.real > 0 || b.real == floorf(b.real))) {
+                    /* Real base, real exponent with a real result. */
+                    z.real = powf(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) {
+                    r = a.real;
+                    theta = 0;
+                } else {
+                    /* Negative real base: modulus -a.real, argument pi. */
+                    r = -a.real;
+                    theta = atan2f(0.0f, -1.0f);
+                }
+            } else {
+                r = __Pyx_c_absf(a);
+                theta = atan2f(a.imag, a.real);
+            }
+            lnr = logf(r);
+            z_r = expf(lnr * b.real - theta * b.imag);
+            z_theta = theta * b.real + lnr * b.imag;
+            z.real = z_r * cosf(z_theta);
+            z.imag = z_r * sinf(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+#if CYTHON_CCOMPLEX /* native complex support: C++ std::complex or C _Complex_I */
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { /* build a double complex from real part x and imaginary part y */
+      return ::std::complex< double >(x, y);
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { /* build a double complex from real part x and imaginary part y */
+      return x + y*(__pyx_t_double_complex)_Complex_I;
+    }
+  #endif
+#else /* no native complex: the type is a struct with .real and .imag members */
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) { /* build a double complex from real part x and imaginary part y */
+      __pyx_t_double_complex z;
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+#if CYTHON_CCOMPLEX
+#else
+    /* Return 1 when both the real and imaginary parts compare equal, else 0. */
+    static CYTHON_INLINE int __Pyx_c_eq(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        if (a.real != b.real)
+            return 0;
+        return a.imag == b.imag;
+    }
+    /* Component-wise sum a + b of two struct-based double complex values. */
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        __pyx_t_double_complex total;
+        total.imag = a.imag + b.imag;
+        total.real = a.real + b.real;
+        return total;
+    }
+    /* Component-wise difference a - b of two struct-based double complex values. */
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        __pyx_t_double_complex delta;
+        delta.imag = a.imag - b.imag;
+        delta.real = a.real - b.real;
+        return delta;
+    }
+    /* Complex product a * b: (ar*br - ai*bi) + (ar*bi + ai*br)i. */
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        __pyx_t_double_complex product;
+        product.imag = a.real * b.imag + a.imag * b.real;
+        product.real = a.real * b.real - a.imag * b.imag;
+        return product;
+    }
+    /* Complex quotient a / b, computed as a * conj(b) / |b|^2. */
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+        __pyx_t_double_complex quotient;
+        double norm_sq = b.real * b.real + b.imag * b.imag;
+        quotient.imag = (a.imag * b.real - a.real * b.imag) / norm_sq;
+        quotient.real = (a.real * b.real + a.imag * b.imag) / norm_sq;
+        return quotient;
+    }
+    /* Negation -a: both components change sign. */
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg(__pyx_t_double_complex a) {
+        __pyx_t_double_complex negated;
+        negated.imag = -a.imag;
+        negated.real = -a.real;
+        return negated;
+    }
+    /* Return 1 when both components compare equal to zero, else 0. */
+    static CYTHON_INLINE int __Pyx_c_is_zero(__pyx_t_double_complex a) {
+        if (a.real != 0)
+            return 0;
+        return a.imag == 0;
+    }
+    /* Complex conjugate: same real part, imaginary part with flipped sign. */
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj(__pyx_t_double_complex a) {
+        __pyx_t_double_complex conjugate;
+        conjugate.imag = -a.imag;
+        conjugate.real =  a.real;
+        return conjugate;
+    }
+    #if 1
+        /* Magnitude |z|: hypot() when HAVE_HYPOT is defined and the compiler
+         * is not MSVC, otherwise the plain sqrt formula. */
+        static CYTHON_INLINE double __Pyx_c_abs(__pyx_t_double_complex z) {
+          #if defined(HAVE_HYPOT) && !defined(_MSC_VER)
+            return hypot(z.real, z.imag);
+          #else
+            return sqrt(z.real*z.real + z.imag*z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow(__pyx_t_double_complex a, __pyx_t_double_complex b) {
+            /* Complex power a ** b for the struct-based double complex type.
+             *
+             * Small integral real exponents (|b| <= 4) are expanded into
+             * multiplications; everything else goes through the polar form
+             * exp(b * log(a)).  A purely real base is special-cased so that a
+             * negative base never reaches log() with a negative argument
+             * (which yields NaN): it either uses the real pow() or the polar
+             * form with argument pi.
+             */
+            __pyx_t_double_complex z;
+            double r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) {
+                if (b.real < 0) {
+                    /* a ** -n == (1/a) ** n: invert the base, negate the exponent. */
+                    double denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) {
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        return __Pyx_c_prod(a, a);
+                    case 3:
+                        z = __Pyx_c_prod(a, a);
+                        return __Pyx_c_prod(z, a);
+                    case 4:
+                        z = __Pyx_c_prod(a, a);
+                        return __Pyx_c_prod(z, z);
+                }
+            }
+            if (a.imag == 0) {
+                if (a.real == 0) {
+                    return a;
+                } else if (b.imag == 0 && (a.real > 0 || b.real == floor(b.real))) {
+                    /* Real base, real exponent with a real result. */
+                    z.real = pow(a.real, b.real);
+                    z.imag = 0;
+                    return z;
+                } else if (a.real > 0) {
+                    r = a.real;
+                    theta = 0;
+                } else {
+                    /* Negative real base: modulus -a.real, argument pi. */
+                    r = -a.real;
+                    theta = atan2(0.0, -1.0);
+                }
+            } else {
+                r = __Pyx_c_abs(a);
+                theta = atan2(a.imag, a.real);
+            }
+            lnr = log(r);
+            z_r = exp(lnr * b.real - theta * b.imag);
+            z_theta = theta * b.real + lnr * b.imag;
+            z.real = z_r * cos(z_theta);
+            z.imag = z_r * sin(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+ #endif
+#endif
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) { /* Convert Python object `x` to a C `long`; returns (long)-1 on the failure paths, which set OverflowError/RuntimeError here or leave the callee's error. */
+    const long neg_one = (long) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero; /* true only when the C type cannot represent -1 */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) { /* Python 2 `int` objects */
+        if (sizeof(long) < sizeof(long)) { /* never true for `long`; the same ladder is used for the other integer types */
+            __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG)
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to long");
+                return (long) -1;
+            }
+            return (long) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(long)) { /* fast path: read zero- and one-digit values straight from the PyLong internals */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return (long) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (unlikely(Py_SIZE(x) < 0)) { /* a negative size is treated as a negative value */
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to long");
+                return (long) -1;
+            }
+            if (sizeof(long) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT(long, unsigned long, PyLong_AsUnsignedLong)
+            } else if (sizeof(long) <= sizeof(unsigned long long)) {
+                __PYX_VERIFY_RETURN_INT(long, unsigned long long, PyLong_AsUnsignedLongLong)
+            }
+        } else {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(long)) { /* fast path for values stored in at most one digit; the sign comes from Py_SIZE */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return +(long) ((PyLongObject*)x)->ob_digit[0];
+                    case -1: return -(long) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (sizeof(long) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT(long, long, PyLong_AsLong)
+            } else if (sizeof(long) <= sizeof(long long)) {
+                __PYX_VERIFY_RETURN_INT(long, long long, PyLong_AsLongLong)
+            }
+        }
+        { /* fallback when no sizeof branch above returned: convert through the raw byte representation */
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            long val;
+            PyObject *v = __Pyx_PyNumber_Int(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) { /* _PyLong_AsByteArray needs a PyLong, so promote other results */
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one; /* runtime endianness probe */
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned);
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (long) -1;
+        }
+    } else { /* not an int/long: coerce with __Pyx_PyNumber_Int and convert the result recursively */
+        long val;
+        PyObject *tmp = __Pyx_PyNumber_Int(x);
+        if (!tmp) return (long) -1;
+        val = __Pyx_PyInt_As_long(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+}
+
+static int __Pyx_check_binary_version(void) {
+    /* Warn when the Python "major.minor" this module was compiled against
+     * differs from the running interpreter's version string.
+     *
+     * Returns 0 when the versions match, otherwise the result of issuing
+     * the warning (PyErr_Warn / PyErr_WarnEx).
+     *
+     * The comparison covers the whole "major.minor" prefix, so two-digit
+     * minor versions (e.g. 3.10) are told apart from 3.1.
+     */
+    char ctversion[5];
+    int same = 1, i, found_dot;
+    const char *rt_from_call = Py_GetVersion();
+    PyOS_snprintf(ctversion, 5, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION);
+    for (i = 0; i < 4; i++) {
+        if (!ctversion[i]) {
+            /* Compile-time string ended: the runtime version must not
+             * continue with another digit of the minor number. */
+            same = (rt_from_call[i] < '0' || rt_from_call[i] > '9');
+            break;
+        }
+        if (rt_from_call[i] != ctversion[i]) {
+            same = 0;
+            break;
+        }
+    }
+    if (!same) {
+        char rtversion[5] = {'\0'};
+        char message[200];
+        /* Extract just "major.minor" from the runtime string for the message. */
+        found_dot = 0;
+        for (i = 0; i < 4; ++i) {
+            if (rt_from_call[i] == '.') {
+                if (found_dot) break;
+                found_dot = 1;
+            } else if (rt_from_call[i] < '0' || rt_from_call[i] > '9') {
+                break;
+            }
+            rtversion[i] = rt_from_call[i];
+        }
+        PyOS_snprintf(message, sizeof(message),
+                      "compiletime version %s of module '%.100s' "
+                      "does not match runtime version %s",
+                      ctversion, __Pyx_MODULE_NAME, rtversion);
+        #if PY_VERSION_HEX < 0x02050000
+        return PyErr_Warn(NULL, message);
+        #else
+        return PyErr_WarnEx(NULL, message, 1);
+        #endif
+    }
+    return 0;
+}
+
+#ifndef __PYX_HAVE_RT_ImportModule
+#define __PYX_HAVE_RT_ImportModule
+static PyObject *__Pyx_ImportModule(const char *name) {
+    /* Import the module called `name` (a C string) via PyImport_Import().
+     * Returns the module object, or NULL (0) when building the name object
+     * or the import itself fails. */
+    PyObject *module_obj;
+    PyObject *name_obj = __Pyx_PyIdentifier_FromString(name);
+    if (!name_obj)
+        return 0;
+    module_obj = PyImport_Import(name_obj);
+    Py_DECREF(name_obj);
+    return module_obj;
+}
+#endif
+
+#ifndef __PYX_HAVE_RT_ImportType
+#define __PYX_HAVE_RT_ImportType
+static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, /* Import module_name.class_name, check it is a type whose basicsize is compatible with `size`; returns the type or NULL with an error set. */
+    size_t size, int strict)
+{
+    PyObject *py_module = 0;
+    PyObject *result = 0;
+    PyObject *py_name = 0;
+    char warning[200];
+    Py_ssize_t basicsize;
+#ifdef Py_LIMITED_API
+    PyObject *py_basicsize;
+#endif
+    py_module = __Pyx_ImportModule(module_name);
+    if (!py_module)
+        goto bad;
+    py_name = __Pyx_PyIdentifier_FromString(class_name);
+    if (!py_name)
+        goto bad;
+    result = PyObject_GetAttr(py_module, py_name);
+    Py_DECREF(py_name);
+    py_name = 0;
+    Py_DECREF(py_module);
+    py_module = 0; /* cleared so the cleanup at `bad` does not release it twice */
+    if (!result)
+        goto bad;
+    if (!PyType_Check(result)) {
+        PyErr_Format(PyExc_TypeError,
+            "%.200s.%.200s is not a type object",
+            module_name, class_name);
+        goto bad;
+    }
+#ifndef Py_LIMITED_API
+    basicsize = ((PyTypeObject *)result)->tp_basicsize;
+#else /* limited API: read the size through the __basicsize__ attribute instead of the struct field */
+    py_basicsize = PyObject_GetAttrString(result, "__basicsize__");
+    if (!py_basicsize)
+        goto bad;
+    basicsize = PyLong_AsSsize_t(py_basicsize);
+    Py_DECREF(py_basicsize);
+    py_basicsize = 0;
+    if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred())
+        goto bad;
+#endif
+    if (!strict && (size_t)basicsize > size) { /* non-strict: a larger runtime type only triggers a warning */
+        PyOS_snprintf(warning, sizeof(warning),
+            "%s.%s size changed, may indicate binary incompatibility",
+            module_name, class_name);
+        #if PY_VERSION_HEX < 0x02050000
+        if (PyErr_Warn(NULL, warning) < 0) goto bad;
+        #else
+        if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad;
+        #endif
+    }
+    else if ((size_t)basicsize != size) { /* any other size mismatch is an error */
+        PyErr_Format(PyExc_ValueError,
+            "%.200s.%.200s has the wrong size, try recompiling",
+            module_name, class_name);
+        goto bad;
+    }
+    return (PyTypeObject *)result;
+bad:
+    Py_XDECREF(py_module);
+    Py_XDECREF(result);
+    return NULL;
+}
+#endif
+
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) {
+    /* Binary search over `entries` (ordered by code_line) for `code_line`.
+     * Returns the index of the matching entry, or the index at which an
+     * entry for `code_line` would have to be inserted (`count` when it is
+     * greater than the last entry). */
+    int lo = 0, probe = 0, hi = count - 1;
+    if (hi >= 0 && code_line > entries[hi].code_line)
+        return count;
+    while (lo < hi) {
+        int probed_line;
+        probe = (lo + hi) / 2;
+        probed_line = entries[probe].code_line;
+        if (code_line == probed_line)
+            return probe;
+        if (code_line < probed_line)
+            hi = probe;
+        else
+            lo = probe + 1;
+    }
+    return (code_line <= entries[probe].code_line) ? probe : probe + 1;
+}
+static PyCodeObject *__pyx_find_code_object(int code_line) {
+    /* Look up the cached code object stored for `code_line`.
+     * Returns a new reference to it, or NULL when `code_line` is 0, the
+     * cache has no entries array, or no entry matches. */
+    PyCodeObject *hit;
+    int idx;
+    if (unlikely(!code_line))
+        return NULL;
+    if (unlikely(!__pyx_code_cache.entries))
+        return NULL;
+    idx = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if (unlikely(idx >= __pyx_code_cache.count))
+        return NULL;
+    if (unlikely(__pyx_code_cache.entries[idx].code_line != code_line))
+        return NULL;
+    hit = __pyx_code_cache.entries[idx].code_object;
+    Py_INCREF(hit);
+    return hit;
+}
+/* Insert code_object into the cache under code_line, or replace the object
+ * already stored for that line.  The cache takes its own reference to
+ * code_object.  If memory cannot be (re)allocated the function returns
+ * without caching and without reporting an error. */
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) {
+    int pos, i;
+    __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries;
+    /* A code_line of 0 is never cached. */
+    if (unlikely(!code_line)) {
+        return;
+    }
+    /* First use: allocate room for 64 entries and store this one at index 0. */
+    if (unlikely(!entries)) {
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (likely(entries)) {
+            __pyx_code_cache.entries = entries;
+            __pyx_code_cache.max_count = 64;
+            __pyx_code_cache.count = 1;
+            entries[0].code_line = code_line;
+            entries[0].code_object = code_object;
+            Py_INCREF(code_object);
+        }
+        return;
+    }
+    pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    /* Existing entry for this line: swap in the new object, then release the old one. */
+    if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) {
+        PyCodeObject* tmp = entries[pos].code_object;
+        entries[pos].code_object = code_object;
+        Py_DECREF(tmp);
+        return;
+    }
+    /* Cache is full: grow it by another 64 slots. */
+    if (__pyx_code_cache.count == __pyx_code_cache.max_count) {
+        int new_max = __pyx_code_cache.max_count + 64;
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc(
+            __pyx_code_cache.entries, new_max*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (unlikely(!entries)) {
+            return;
+        }
+        __pyx_code_cache.entries = entries;
+        __pyx_code_cache.max_count = new_max;
+    }
+    /* Shift entries at and after pos one slot to the right to open a gap. */
+    for (i=__pyx_code_cache.count; i>pos; i--) {
+        entries[i] = entries[i-1];
+    }
+    entries[pos].code_line = code_line;
+    entries[pos].code_object = code_object;
+    __pyx_code_cache.count++;
+    Py_INCREF(code_object);
+}
+
+#include "compile.h"
+#include "frameobject.h"
+#include "traceback.h"
+/* Build a placeholder code object used only to label a traceback entry.
+ * The code object has no arguments, locals or bytecode (empty bytes/tuples);
+ * it carries `filename`, a function name and `py_line` as first line number.
+ * When c_line is non-zero the name is formatted as
+ * "funcname (<__pyx_cfilenm>:c_line)".
+ * Returns the result of __Pyx_PyCode_New (NULL if that fails), or NULL if
+ * one of the string objects could not be created. */
+static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(
+            const char *funcname, int c_line,
+            int py_line, const char *filename) {
+    PyCodeObject *py_code = 0;
+    PyObject *py_srcfile = 0;
+    PyObject *py_funcname = 0;
+    /* Source file name: byte string on Python 2, unicode on Python 3. */
+    #if PY_MAJOR_VERSION < 3
+    py_srcfile = PyString_FromString(filename);
+    #else
+    py_srcfile = PyUnicode_FromString(filename);
+    #endif
+    if (!py_srcfile) goto bad;
+    if (c_line) {
+        /* Include the C file name and C line in the displayed function name. */
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        #else
+        py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        #endif
+    }
+    else {
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromString(funcname);
+        #else
+        py_funcname = PyUnicode_FromString(funcname);
+        #endif
+    }
+    if (!py_funcname) goto bad;
+    py_code = __Pyx_PyCode_New(
+        0,            /*int argcount,*/
+        0,            /*int kwonlyargcount,*/
+        0,            /*int nlocals,*/
+        0,            /*int stacksize,*/
+        0,            /*int flags,*/
+        __pyx_empty_bytes, /*PyObject *code,*/
+        __pyx_empty_tuple, /*PyObject *consts,*/
+        __pyx_empty_tuple, /*PyObject *names,*/
+        __pyx_empty_tuple, /*PyObject *varnames,*/
+        __pyx_empty_tuple, /*PyObject *freevars,*/
+        __pyx_empty_tuple, /*PyObject *cellvars,*/
+        py_srcfile,   /*PyObject *filename,*/
+        py_funcname,  /*PyObject *name,*/
+        py_line,      /*int firstlineno,*/
+        __pyx_empty_bytes  /*PyObject *lnotab*/
+    );
+    /* Drop the local references to the two strings; py_code may be NULL here. */
+    Py_DECREF(py_srcfile);
+    Py_DECREF(py_funcname);
+    return py_code;
+bad:
+    Py_XDECREF(py_srcfile);
+    Py_XDECREF(py_funcname);
+    return NULL;
+}
+/* Append a traceback entry for (funcname, filename, py_line).
+ * A placeholder code object is fetched from the code-object cache, keyed by
+ * c_line when it is non-zero and by py_line otherwise; on a miss a new one is
+ * created and inserted into the cache.  A frame is then created for it, its
+ * line number set to py_line, and PyTraceBack_Here is called on that frame.
+ * Any failure along the way just skips the remaining steps. */
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename) {
+    PyCodeObject *py_code = 0;
+    PyObject *py_globals = 0;
+    PyFrameObject *py_frame = 0;
+    py_code = __pyx_find_code_object(c_line ? c_line : py_line);
+    if (!py_code) {
+        py_code = __Pyx_CreateCodeObjectForTraceback(
+            funcname, c_line, py_line, filename);
+        if (!py_code) goto bad;
+        __pyx_insert_code_object(c_line ? c_line : py_line, py_code);
+    }
+    /* py_globals is not released in this function. */
+    py_globals = PyModule_GetDict(__pyx_m);
+    if (!py_globals) goto bad;
+    py_frame = PyFrame_New(
+        PyThreadState_GET(), /*PyThreadState *tstate,*/
+        py_code,             /*PyCodeObject *code,*/
+        py_globals,          /*PyObject *globals,*/
+        0                    /*PyObject *locals*/
+    );
+    if (!py_frame) goto bad;
+    py_frame->f_lineno = py_line;
+    PyTraceBack_Here(py_frame);
+bad:
+    /* Reached on success as well: release the local references. */
+    Py_XDECREF(py_code);
+    Py_XDECREF(py_frame);
+}
+
+/* Create the Python string objects described by the string table `t`.
+ * The table is walked until an entry with a NULL `p` field is reached.  For
+ * each entry the created object is stored through `t->p`; `t->n - 1` is
+ * passed as the length wherever a length is needed.
+ * Returns 0 on success and -1 as soon as one object cannot be created. */
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
+    while (t->p) {
+        #if PY_MAJOR_VERSION < 3
+        /* Python 2: unicode entries are decoded from UTF-8; everything else
+           becomes a byte string, interned when requested. */
+        if (t->is_unicode) {
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
+        } else if (t->intern) {
+            *t->p = PyString_InternFromString(t->s);
+        } else {
+            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        }
+        #else  /* Python 3+ has unicode identifiers */
+        /* Python 3: both `is_unicode` and `is_str` entries become unicode
+           objects; all other entries become bytes. */
+        if (t->is_unicode | t->is_str) {
+            if (t->intern) {
+                *t->p = PyUnicode_InternFromString(t->s);
+            } else if (t->encoding) {
+                *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
+            } else {
+                *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
+            }
+        } else {
+            *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
+        }
+        #endif
+        if (!*t->p)
+            return -1;
+        ++t;
+    }
+    return 0;
+}
+
+/* Convert a NUL-terminated C string to a unicode object by measuring it with
+ * strlen() and delegating to __Pyx_PyUnicode_FromStringAndSize. */
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char* c_str) {
+    const size_t byte_count = strlen(c_str);
+    return __Pyx_PyUnicode_FromStringAndSize(c_str, byte_count);
+}
+/* Same as __Pyx_PyObject_AsStringAndSize, for callers that do not need the
+ * length: the length is written to a local and dropped. */
+static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject* o) {
+    Py_ssize_t discarded_length;
+    return __Pyx_PyObject_AsStringAndSize(o, &discarded_length);
+}
+/* Return a char* view of `o` and store its size in `*length`.
+ * Depending on the build configuration, unicode objects are handled first
+ * (only when one of the __PYX_DEFAULT_STRING_ENCODING_IS_* switches is set),
+ * then bytearray objects (CPython >= 2.6, not PyPy), and finally everything
+ * else is passed to PyBytes_AsStringAndSize.
+ * Returns NULL on failure. */
+static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) {
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+    if (
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+            __Pyx_sys_getdefaultencoding_not_ascii &&
+#endif
+            PyUnicode_Check(o)) {
+#if PY_VERSION_HEX < 0x03030000
+        /* Before Python 3.3: use the default-encoded bytes of the unicode object. */
+        char* defenc_c;
+        PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL);
+        if (!defenc) return NULL;
+        defenc_c = PyBytes_AS_STRING(defenc);
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+        {
+            /* Reject any byte >= 128 when the configured encoding is ASCII. */
+            char* end = defenc_c + PyBytes_GET_SIZE(defenc);
+            char* c;
+            for (c = defenc_c; c < end; c++) {
+                if ((unsigned char) (*c) >= 128) {
+                    /* Return value is discarded.
+                       NOTE(review): presumably called only so that it sets
+                       the encoding exception - confirm. */
+                    PyUnicode_AsASCIIString(o);
+                    return NULL;
+                }
+            }
+        }
+#endif /*__PYX_DEFAULT_STRING_ENCODING_IS_ASCII*/
+        *length = PyBytes_GET_SIZE(defenc);
+        return defenc_c;
+#else /* PY_VERSION_HEX < 0x03030000 */
+        if (PyUnicode_READY(o) == -1) return NULL;
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+        if (PyUnicode_IS_ASCII(o)) {
+            *length = PyUnicode_GET_DATA_SIZE(o);
+            return PyUnicode_AsUTF8(o);
+        } else {
+            /* Return value is discarded.
+               NOTE(review): presumably called only so that it sets the
+               encoding exception - confirm. */
+            PyUnicode_AsASCIIString(o);
+            return NULL;
+        }
+#else /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */
+        return PyUnicode_AsUTF8AndSize(o, length);
+#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */
+#endif /* PY_VERSION_HEX < 0x03030000 */
+    } else
+#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII  || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT */
+#if !CYTHON_COMPILING_IN_PYPY
+#if PY_VERSION_HEX >= 0x02060000
+    if (PyByteArray_Check(o)) {
+        *length = PyByteArray_GET_SIZE(o);
+        return PyByteArray_AS_STRING(o);
+    } else
+#endif
+#endif
+    {
+        /* Fallback: let PyBytes_AsStringAndSize fill in pointer and length. */
+        char* result;
+        int r = PyBytes_AsStringAndSize(o, &result, length);
+        if (unlikely(r < 0)) {
+            return NULL;
+        } else {
+            return result;
+        }
+    }
+}
+/* Truth test with identity shortcuts: Py_True yields 1, Py_False and Py_None
+ * yield 0, and every other object is passed to PyObject_IsTrue. */
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
+   if (x == Py_True) return 1;
+   if (x == Py_False || x == Py_None) return 0;
+   return PyObject_IsTrue(x);
+}
+/* Convert `x` to a Python integer object.
+ * If `x` already passes the int/long check it is returned with its reference
+ * count incremented.  Otherwise the type's nb_int slot (and, on Python 2,
+ * nb_long as a second choice) decides which conversion is attempted.
+ * Returns NULL with TypeError set when the conversion yields a non-integer,
+ * or when no conversion was possible and no other exception is pending. */
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) {
+  PyNumberMethods *m;
+  const char *name = NULL;
+  PyObject *res = NULL;
+#if PY_MAJOR_VERSION < 3
+  if (PyInt_Check(x) || PyLong_Check(x))
+#else
+  if (PyLong_Check(x))
+#endif
+    return Py_INCREF(x), x;
+  m = Py_TYPE(x)->tp_as_number;
+#if PY_MAJOR_VERSION < 3
+  if (m && m->nb_int) {
+    name = "int";
+    res = PyNumber_Int(x);
+  }
+  else if (m && m->nb_long) {
+    name = "long";
+    res = PyNumber_Long(x);
+  }
+#else
+  if (m && m->nb_int) {
+    name = "int";
+    res = PyNumber_Long(x);
+  }
+#endif
+  if (res) {
+    /* Guard against a conversion that returned something other than an integer. */
+#if PY_MAJOR_VERSION < 3
+    if (!PyInt_Check(res) && !PyLong_Check(res)) {
+#else
+    if (!PyLong_Check(res)) {
+#endif
+      PyErr_Format(PyExc_TypeError,
+                   "__%.4s__ returned non-%.4s (type %.200s)",
+                   name, name, Py_TYPE(res)->tp_name);
+      Py_DECREF(res);
+      return NULL;
+    }
+  }
+  else if (!PyErr_Occurred()) {
+    /* Nothing was converted and nothing raised: report a generic TypeError. */
+    PyErr_SetString(PyExc_TypeError,
+                    "an integer is required");
+  }
+  return res;
+}
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+ #endif
+#endif
+/* Convert an index-like object `b` to Py_ssize_t.
+ * Exact int (Python 2) and exact long objects take fast paths; any other
+ * object goes through PyNumber_Index first.
+ * Returns -1 when PyNumber_Index fails. */
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {
+  Py_ssize_t ival;
+  PyObject *x;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_CheckExact(b)))
+      return PyInt_AS_LONG(b);
+#endif
+  if (likely(PyLong_CheckExact(b))) {
+    #if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+     #if CYTHON_USE_PYLONG_INTERNALS
+       /* Py_SIZE of -1, 0 or 1: read the value straight from ob_digit[0]
+          (or return 0) without calling into the long API. */
+       switch (Py_SIZE(b)) {
+       case -1: return -(sdigit)((PyLongObject*)b)->ob_digit[0];
+       case  0: return 0;
+       case  1: return ((PyLongObject*)b)->ob_digit[0];
+       }
+     #endif
+    #endif
+  #if PY_VERSION_HEX < 0x02060000
+    return PyInt_AsSsize_t(b);
+  #else
+    return PyLong_AsSsize_t(b);
+  #endif
+  }
+  /* Generic path: obtain an index object, convert it, release it. */
+  x = PyNumber_Index(b);
+  if (!x) return -1;
+  ival = PyInt_AsSsize_t(x);
+  Py_DECREF(x);
+  return ival;
+}
+/* Create a Python integer object from a size_t value.
+ * On Python >= 2.5 this is PyInt_FromSize_t.  On older versions values up to
+ * LONG_MAX go through PyInt_FromLong and larger ones are built from the raw
+ * bytes of `ival` as an unsigned number. */
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {
+#if PY_VERSION_HEX < 0x02050000
+   if (ival <= LONG_MAX)
+       return PyInt_FromLong((long)ival);
+   else {
+       unsigned char *bytes = (unsigned char *) &ival;
+       /* `little` is 1 when the first byte of the int 1 is non-zero,
+          i.e. on a little-endian host. */
+       int one = 1; int little = (int)*(unsigned char*)&one;
+       return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0);
+   }
+#else
+   return PyInt_FromSize_t(ival);
+#endif
+}
+
+
+#endif /* Py_PYTHON_H */
diff --git a/src/tools/voc_eval_lib/nms/cpu_nms.pyx b/src/tools/voc_eval_lib/nms/cpu_nms.pyx
new file mode 100644
index 0000000..1d0bef3
--- /dev/null
+++ b/src/tools/voc_eval_lib/nms/cpu_nms.pyx
@@ -0,0 +1,68 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
+    # Larger of two float32 values; b is returned whenever `a >= b` is false.
+    if a >= b:
+        return a
+    return b
+
+cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
+    # Smaller of two float32 values; b is returned whenever `a <= b` is false.
+    if a <= b:
+        return a
+    return b
+
+def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
+    """Greedy non-maximum suppression over scored boxes.
+
+    Boxes are visited in order of decreasing score. Each box that has not
+    been suppressed is kept, and every lower-scoring box whose overlap with
+    it (intersection over union, with widths and heights computed using the
+    "+ 1" convention) is >= thresh is suppressed.
+
+    Args:
+        dets: 2-D float32 array; columns 0-3 are read as x1, y1, x2, y2 and
+            column 4 as the score.
+        thresh: overlap value at or above which a box is suppressed.
+
+    Returns:
+        List of row indices into dets for the kept boxes, in order of
+        decreasing score.
+    """
+    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
+    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
+    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
+    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
+    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
+
+    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+    # argsort() returns intp indices; typing the buffer as intp_t (instead of
+    # int_t, i.e. C long) avoids a buffer dtype mismatch on platforms where
+    # long and intp differ in size.
+    cdef np.ndarray[np.intp_t, ndim=1] order = scores.argsort()[::-1]
+
+    cdef int ndets = dets.shape[0]
+    # np.intp replaces the `np.int` alias, which newer NumPy releases removed.
+    cdef np.ndarray[np.intp_t, ndim=1] suppressed = \
+            np.zeros((ndets), dtype=np.intp)
+
+    # nominal indices
+    cdef int _i, _j
+    # sorted indices
+    cdef int i, j
+    # temp variables for box i's (the box currently under consideration)
+    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
+    # variables for computing overlap with box j (lower scoring box)
+    cdef np.float32_t xx1, yy1, xx2, yy2
+    cdef np.float32_t w, h
+    cdef np.float32_t inter, ovr
+
+    keep = []
+    for _i in range(ndets):
+        i = order[_i]
+        if suppressed[i] == 1:
+            continue
+        keep.append(i)
+        ix1 = x1[i]
+        iy1 = y1[i]
+        ix2 = x2[i]
+        iy2 = y2[i]
+        iarea = areas[i]
+        # Only boxes that come later in score order can be suppressed by box i.
+        for _j in range(_i + 1, ndets):
+            j = order[_j]
+            if suppressed[j] == 1:
+                continue
+            xx1 = max(ix1, x1[j])
+            yy1 = max(iy1, y1[j])
+            xx2 = min(ix2, x2[j])
+            yy2 = min(iy2, y2[j])
+            # Clamp to zero so disjoint boxes contribute no intersection.
+            w = max(0.0, xx2 - xx1 + 1)
+            h = max(0.0, yy2 - yy1 + 1)
+            inter = w * h
+            ovr = inter / (iarea + areas[j] - inter)
+            if ovr >= thresh:
+                suppressed[j] = 1
+
+    return keep
diff --git a/src/tools/voc_eval_lib/nms/gpu_nms.cpp b/src/tools/voc_eval_lib/nms/gpu_nms.cpp
new file mode 100644
index 0000000..b6fdcea
--- /dev/null
+++ b/src/tools/voc_eval_lib/nms/gpu_nms.cpp
@@ -0,0 +1,6391 @@
+/* Generated by Cython 0.20.1 on Wed Oct  5 13:15:30 2016 */
+
+/* Defined ahead of the Python.h include that follows. */
+#define PY_SSIZE_T_CLEAN
+/* Unless the builder set CYTHON_USE_PYLONG_INTERNALS already, enable it only
+ * when pyconfig.h itself defines PYLONG_BITS_IN_DIGIT; if that macro was
+ * already defined before pyconfig.h was included, leave the feature off. */
+#ifndef CYTHON_USE_PYLONG_INTERNALS
+#ifdef PYLONG_BITS_IN_DIGIT
+#define CYTHON_USE_PYLONG_INTERNALS 0
+#else
+#include "pyconfig.h"
+#ifdef PYLONG_BITS_IN_DIGIT
+#define CYTHON_USE_PYLONG_INTERNALS 1
+#else
+#define CYTHON_USE_PYLONG_INTERNALS 0
+#endif
+#endif
+#endif
+#include "Python.h"
+#ifndef Py_PYTHON_H
+    #error Python headers needed to compile C extensions, please install development version of Python.
+#elif PY_VERSION_HEX < 0x02040000
+    #error Cython requires Python 2.4+.
+#else
+#define CYTHON_ABI "0_20_1"
+#include <stddef.h> /* For offsetof */
+/* Fallback offsetof for toolchains whose <stddef.h> does not provide one. */
+#ifndef offsetof
+#define offsetof(type, member) ( (size_t) & ((type*)0) -> member )
+#endif
+/* Outside Windows, define the calling-convention keywords as empty macros. */
+#if !defined(WIN32) && !defined(MS_WINDOWS)
+  #ifndef __stdcall
+    #define __stdcall
+  #endif
+  #ifndef __cdecl
+    #define __cdecl
+  #endif
+  #ifndef __fastcall
+    #define __fastcall
+  #endif
+#endif
+/* Pass-through defaults for the import/export decoration macros. */
+#ifndef DL_IMPORT
+  #define DL_IMPORT(t) t
+#endif
+#ifndef DL_EXPORT
+  #define DL_EXPORT(t) t
+#endif
+#ifndef PY_LONG_LONG
+  #define PY_LONG_LONG LONG_LONG
+#endif
+#ifndef Py_HUGE_VAL
+  #define Py_HUGE_VAL HUGE_VAL
+#endif
+/* Detect PyPy through PYPY_VERSION and expose the result as two 0/1 flags. */
+#ifdef PYPY_VERSION
+#define CYTHON_COMPILING_IN_PYPY 1
+#define CYTHON_COMPILING_IN_CPYTHON 0
+#else
+#define CYTHON_COMPILING_IN_PYPY 0
+#define CYTHON_COMPILING_IN_CPYTHON 1
+#endif
+/* Under PyPy, Py_OptimizeFlag is replaced by the constant 0. */
+#if CYTHON_COMPILING_IN_PYPY
+#define Py_OptimizeFlag 0
+#endif
+/* Python < 2.5 compatibility: emulate Py_ssize_t and the index protocol with
+ * plain int-based definitions.  Newer versions only need the format-string
+ * and index-check aliases from the #else branch. */
+#if PY_VERSION_HEX < 0x02050000
+  typedef int Py_ssize_t;
+  #define PY_SSIZE_T_MAX INT_MAX
+  #define PY_SSIZE_T_MIN INT_MIN
+  #define PY_FORMAT_SIZE_T ""
+  #define CYTHON_FORMAT_SSIZE_T ""
+  #define PyInt_FromSsize_t(z) PyInt_FromLong(z)
+  #define PyInt_AsSsize_t(o)   __Pyx_PyInt_As_int(o)
+  #define PyNumber_Index(o)    ((PyNumber_Check(o) && !PyFloat_Check(o)) ? PyNumber_Int(o) : \
+                                (PyErr_Format(PyExc_TypeError, \
+                                              "expected index value, got %.200s", Py_TYPE(o)->tp_name), \
+                                 (PyObject*)0))
+  #define __Pyx_PyIndex_Check(o) (PyNumber_Check(o) && !PyFloat_Check(o) && \
+                                  !PyComplex_Check(o))
+  #define PyIndex_Check __Pyx_PyIndex_Check
+  #define PyErr_WarnEx(category, message, stacklevel) PyErr_Warn(category, message)
+  #define __PYX_BUILD_PY_SSIZE_T "i"
+#else
+  #define __PYX_BUILD_PY_SSIZE_T "n"
+  #define CYTHON_FORMAT_SSIZE_T "z"
+  #define __Pyx_PyIndex_Check PyIndex_Check
+#endif
+/* Python < 2.6 compatibility: provide the object-header accessor macros and a
+ * local definition of the Py_buffer struct, its PyBUF_* flags and the
+ * get/release function pointer types. */
+#if PY_VERSION_HEX < 0x02060000
+  #define Py_REFCNT(ob) (((PyObject*)(ob))->ob_refcnt)
+  #define Py_TYPE(ob)   (((PyObject*)(ob))->ob_type)
+  #define Py_SIZE(ob)   (((PyVarObject*)(ob))->ob_size)
+  #define PyVarObject_HEAD_INIT(type, size) \
+          PyObject_HEAD_INIT(type) size,
+  /* Expands to nothing on these versions. */
+  #define PyType_Modified(t)
+  typedef struct {
+     void *buf;
+     PyObject *obj;
+     Py_ssize_t len;
+     Py_ssize_t itemsize;
+     int readonly;
+     int ndim;
+     char *format;
+     Py_ssize_t *shape;
+     Py_ssize_t *strides;
+     Py_ssize_t *suboffsets;
+     void *internal;
+  } Py_buffer;
+  #define PyBUF_SIMPLE 0
+  #define PyBUF_WRITABLE 0x0001
+  #define PyBUF_FORMAT 0x0004
+  #define PyBUF_ND 0x0008
+  #define PyBUF_STRIDES (0x0010 | PyBUF_ND)
+  #define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES)
+  #define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES)
+  #define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES)
+  #define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)
+  #define PyBUF_RECORDS (PyBUF_STRIDES | PyBUF_FORMAT | PyBUF_WRITABLE)
+  #define PyBUF_FULL (PyBUF_INDIRECT | PyBUF_FORMAT | PyBUF_WRITABLE)
+  typedef int (*getbufferproc)(PyObject *, Py_buffer *, int);
+  typedef void (*releasebufferproc)(PyObject *, Py_buffer *);
+#endif
+/* __Pyx_PyCode_New hides the PyCode_New signature difference: on Python 2 the
+ * keyword-only argument count `k` is folded into the first argument as a+k,
+ * on Python 3 it is passed separately. */
+#if PY_MAJOR_VERSION < 3
+  #define __Pyx_BUILTIN_MODULE_NAME "__builtin__"
+  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \
+          PyCode_New(a+k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+  #define __Pyx_DefaultClassType PyClass_Type
+#else
+  #define __Pyx_BUILTIN_MODULE_NAME "builtins"
+  #define __Pyx_PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos) \
+          PyCode_New(a, k, l, s, f, code, c, n, v, fv, cell, fn, name, fline, lnos)
+  #define __Pyx_DefaultClassType PyType_Type
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define PyUnicode_FromString(s) PyUnicode_Decode(s, strlen(s), "UTF-8", "strict")
+#endif
+/* The type flags below are defined as 0 for the Python versions selected by
+ * each condition, so OR-ing them into tp_flags has no effect there. */
+#if PY_MAJOR_VERSION >= 3
+  #define Py_TPFLAGS_CHECKTYPES 0
+  #define Py_TPFLAGS_HAVE_INDEX 0
+#endif
+#if (PY_VERSION_HEX < 0x02060000) || (PY_MAJOR_VERSION >= 3)
+  #define Py_TPFLAGS_HAVE_NEWBUFFER 0
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define Py_TPFLAGS_HAVE_VERSION_TAG 0
+#endif
+#if PY_VERSION_HEX < 0x02060000 && !defined(Py_TPFLAGS_IS_ABSTRACT)
+  #define Py_TPFLAGS_IS_ABSTRACT 0
+#endif
+#if PY_VERSION_HEX < 0x030400a1 && !defined(Py_TPFLAGS_HAVE_FINALIZE)
+  #define Py_TPFLAGS_HAVE_FINALIZE 0
+#endif
+/* Unicode access macros.  When PY_VERSION_HEX > 0x03030000 and
+ * PyUnicode_KIND is available, they map onto the PyUnicode_* accessors of the
+ * same name; otherwise they are emulated on top of the Py_UNICODE buffer. */
+#if PY_VERSION_HEX > 0x03030000 && defined(PyUnicode_KIND)
+  #define CYTHON_PEP393_ENABLED 1
+  #define __Pyx_PyUnicode_READY(op)       (likely(PyUnicode_IS_READY(op)) ? \
+                                              0 : _PyUnicode_Ready((PyObject *)(op)))
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_LENGTH(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) PyUnicode_READ_CHAR(u, i)
+  #define __Pyx_PyUnicode_KIND(u)         PyUnicode_KIND(u)
+  #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
+  #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
+#else
+  #define CYTHON_PEP393_ENABLED 0
+  #define __Pyx_PyUnicode_READY(op)       (0)
+  #define __Pyx_PyUnicode_GET_LENGTH(u)   PyUnicode_GET_SIZE(u)
+  #define __Pyx_PyUnicode_READ_CHAR(u, i) ((Py_UCS4)(PyUnicode_AS_UNICODE(u)[i]))
+  #define __Pyx_PyUnicode_KIND(u)         (sizeof(Py_UNICODE))
+  #define __Pyx_PyUnicode_DATA(u)         ((void*)PyUnicode_AS_UNICODE(u))
+  #define __Pyx_PyUnicode_READ(k, d, i)   ((void)(k), (Py_UCS4)(((Py_UNICODE*)d)[i]))
+#endif
+/* Concatenation: PyPy always uses PyNumber_Add; elsewhere the "Safe" variant
+ * falls back to PyNumber_Add only when either operand is None. */
+#if CYTHON_COMPILING_IN_PYPY
+  #define __Pyx_PyUnicode_Concat(a, b)      PyNumber_Add(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  PyNumber_Add(a, b)
+#else
+  #define __Pyx_PyUnicode_Concat(a, b)      PyUnicode_Concat(a, b)
+  #define __Pyx_PyUnicode_ConcatSafe(a, b)  ((unlikely((a) == Py_None) || unlikely((b) == Py_None)) ? \
+      PyNumber_Add(a, b) : __Pyx_PyUnicode_Concat(a, b))
+#endif
+/* "%"-formatting: a None left operand is routed through PyNumber_Remainder. */
+#define __Pyx_PyString_FormatSafe(a, b)  ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : __Pyx_PyString_Format(a, b))
+#define __Pyx_PyUnicode_FormatSafe(a, b)  ((unlikely((a) == Py_None)) ? PyNumber_Remainder(a, b) : PyUnicode_Format(a, b))
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyString_Format(a, b)  PyUnicode_Format(a, b)
+#else
+  #define __Pyx_PyString_Format(a, b)  PyString_Format(a, b)
+#endif
+/* On Python 3 the PyString_* type names are aliased to their PyUnicode_*
+ * counterparts. */
+#if PY_MAJOR_VERSION >= 3
+  #define PyBaseString_Type            PyUnicode_Type
+  #define PyStringObject               PyUnicodeObject
+  #define PyString_Type                PyUnicode_Type
+  #define PyString_Check               PyUnicode_Check
+  #define PyString_CheckExact          PyUnicode_CheckExact
+#endif
+/* On Python < 2.6 the PyBytes_* names are aliased to the PyString_* API. */
+#if PY_VERSION_HEX < 0x02060000
+  #define PyBytesObject                PyStringObject
+  #define PyBytes_Type                 PyString_Type
+  #define PyBytes_Check                PyString_Check
+  #define PyBytes_CheckExact           PyString_CheckExact
+  #define PyBytes_FromString           PyString_FromString
+  #define PyBytes_FromStringAndSize    PyString_FromStringAndSize
+  #define PyBytes_FromFormat           PyString_FromFormat
+  #define PyBytes_DecodeEscape         PyString_DecodeEscape
+  #define PyBytes_AsString             PyString_AsString
+  #define PyBytes_AsStringAndSize      PyString_AsStringAndSize
+  #define PyBytes_Size                 PyString_Size
+  #define PyBytes_AS_STRING            PyString_AS_STRING
+  #define PyBytes_GET_SIZE             PyString_GET_SIZE
+  #define PyBytes_Repr                 PyString_Repr
+  #define PyBytes_Concat               PyString_Concat
+  #define PyBytes_ConcatAndDel         PyString_ConcatAndDel
+#endif
+/* "Base string" checks: unicode only on Python 3, str or unicode on Python 2. */
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyBaseString_Check(obj) PyUnicode_Check(obj)
+  #define __Pyx_PyBaseString_CheckExact(obj) PyUnicode_CheckExact(obj)
+#else
+  #define __Pyx_PyBaseString_Check(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj) || \
+                                         PyString_Check(obj) || PyUnicode_Check(obj))
+  #define __Pyx_PyBaseString_CheckExact(obj) (PyString_CheckExact(obj) || PyUnicode_CheckExact(obj))
+#endif
+#if PY_VERSION_HEX < 0x02060000
+  #define PySet_Check(obj)             PyObject_TypeCheck(obj, &PySet_Type)
+  #define PyFrozenSet_Check(obj)       PyObject_TypeCheck(obj, &PyFrozenSet_Type)
+#endif
+#ifndef PySet_CheckExact
+  #define PySet_CheckExact(obj)        (Py_TYPE(obj) == &PySet_Type)
+#endif
+/* Type check that accepts the type as any pointer and casts it to PyTypeObject*. */
+#define __Pyx_TypeCheck(obj, type) PyObject_TypeCheck(obj, (PyTypeObject *)type)
+/* On Python 3 the PyInt_* names are aliased to the PyLong_* API. */
+#if PY_MAJOR_VERSION >= 3
+  #define PyIntObject                  PyLongObject
+  #define PyInt_Type                   PyLong_Type
+  #define PyInt_Check(op)              PyLong_Check(op)
+  #define PyInt_CheckExact(op)         PyLong_CheckExact(op)
+  #define PyInt_FromString             PyLong_FromString
+  #define PyInt_FromUnicode            PyLong_FromUnicode
+  #define PyInt_FromLong               PyLong_FromLong
+  #define PyInt_FromSize_t             PyLong_FromSize_t
+  #define PyInt_FromSsize_t            PyLong_FromSsize_t
+  #define PyInt_AsLong                 PyLong_AsLong
+  #define PyInt_AS_LONG                PyLong_AS_LONG
+  #define PyInt_AsSsize_t              PyLong_AsSsize_t
+  #define PyInt_AsUnsignedLongMask     PyLong_AsUnsignedLongMask
+  #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask
+  #define PyNumber_Int                 PyNumber_Long
+#endif
+#if PY_MAJOR_VERSION >= 3
+  #define PyBoolObject                 PyLongObject
+#endif
+/* Hash values: a local `long` typedef for Py_hash_t with long-based
+ * converters before 0x030200A4, Py_ssize_t-based converters afterwards. */
+#if PY_VERSION_HEX < 0x030200A4
+  typedef long Py_hash_t;
+  #define __Pyx_PyInt_FromHash_t PyInt_FromLong
+  #define __Pyx_PyInt_AsHash_t   PyInt_AsLong
+#else
+  #define __Pyx_PyInt_FromHash_t PyInt_FromSsize_t
+  #define __Pyx_PyInt_AsHash_t   PyInt_AsSsize_t
+#endif
+/* Slice helpers.  On Python 2 and on Python >= 3.1.3 they are the plain
+ * PySequence_*Slice calls.  On the remaining 3.x versions each call is
+ * guarded: a NULL object sets SystemError, and an object whose type has no
+ * tp_as_mapping sets TypeError, instead of calling PySequence_*Slice. */
+#if (PY_MAJOR_VERSION < 3) || (PY_VERSION_HEX >= 0x03010300)
+  #define __Pyx_PySequence_GetSlice(obj, a, b) PySequence_GetSlice(obj, a, b)
+  #define __Pyx_PySequence_SetSlice(obj, a, b, value) PySequence_SetSlice(obj, a, b, value)
+  #define __Pyx_PySequence_DelSlice(obj, a, b) PySequence_DelSlice(obj, a, b)
+#else
+  #define __Pyx_PySequence_GetSlice(obj, a, b) (unlikely(!(obj)) ? \
+        (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), (PyObject*)0) : \
+        (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_GetSlice(obj, a, b)) : \
+            (PyErr_Format(PyExc_TypeError, "'%.200s' object is unsliceable", (obj)->ob_type->tp_name), (PyObject*)0)))
+  #define __Pyx_PySequence_SetSlice(obj, a, b, value) (unlikely(!(obj)) ? \
+        (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \
+        (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_SetSlice(obj, a, b, value)) : \
+            (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice assignment", (obj)->ob_type->tp_name), -1)))
+  #define __Pyx_PySequence_DelSlice(obj, a, b) (unlikely(!(obj)) ? \
+        (PyErr_SetString(PyExc_SystemError, "null argument to internal routine"), -1) : \
+        (likely((obj)->ob_type->tp_as_mapping) ? (PySequence_DelSlice(obj, a, b)) : \
+            (PyErr_Format(PyExc_TypeError, "'%.200s' object doesn't support slice deletion", (obj)->ob_type->tp_name), -1)))
+#endif
+/* Python 3: keep the three-argument PyMethod_New spelling; `klass` is unused
+ * and a NULL `self` selects PyInstanceMethod_New. */
+#if PY_MAJOR_VERSION >= 3
+  #define PyMethod_New(func, self, klass) ((self) ? PyMethod_New(func, self) : PyInstanceMethod_New(func))
+#endif
+/* Python < 2.5: the attribute-name and docstring arguments are cast to
+ * (char *); newer versions pass them through unchanged. */
+#if PY_VERSION_HEX < 0x02050000
+  #define __Pyx_GetAttrString(o,n)   PyObject_GetAttrString((o),((char *)(n)))
+  #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),((char *)(n)),(a))
+  #define __Pyx_DelAttrString(o,n)   PyObject_DelAttrString((o),((char *)(n)))
+#else
+  #define __Pyx_GetAttrString(o,n)   PyObject_GetAttrString((o),(n))
+  #define __Pyx_SetAttrString(o,n,a) PyObject_SetAttrString((o),(n),(a))
+  #define __Pyx_DelAttrString(o,n)   PyObject_DelAttrString((o),(n))
+#endif
+#if PY_VERSION_HEX < 0x02050000
+  #define __Pyx_NAMESTR(n) ((char *)(n))
+  #define __Pyx_DOCSTR(n)  ((char *)(n))
+#else
+  #define __Pyx_NAMESTR(n) (n)
+  #define __Pyx_DOCSTR(n)  (n)
+#endif
+/* Pick the compiler's spelling of `inline`: GCC-style, MSVC-style, C99, or
+ * nothing at all when none of those is detected. */
+#ifndef CYTHON_INLINE
+  #if defined(__GNUC__)
+    #define CYTHON_INLINE __inline__
+  #elif defined(_MSC_VER)
+    #define CYTHON_INLINE __inline
+  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define CYTHON_INLINE inline
+  #else
+    #define CYTHON_INLINE
+  #endif
+#endif
+/* Same selection scheme for the `restrict` qualifier. */
+#ifndef CYTHON_RESTRICT
+  #if defined(__GNUC__)
+    #define CYTHON_RESTRICT __restrict__
+  #elif defined(_MSC_VER) && _MSC_VER >= 1400
+    #define CYTHON_RESTRICT __restrict
+  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+    #define CYTHON_RESTRICT restrict
+  #else
+    #define CYTHON_RESTRICT
+  #endif
+#endif
+/* __PYX_NAN() yields a float NaN: the NAN macro cast to float when the
+ * toolchain provides it, otherwise a float whose bytes are all set to 0xFF. */
+#ifdef NAN
+#define __PYX_NAN() ((float) NAN)
+#else
+static CYTHON_INLINE float __PYX_NAN() {
+  /* Initialize NaN. The sign is irrelevant, an exponent with all bits 1 and
+   a nonzero mantissa means NaN. If the first bit in the mantissa is 1, it is
+   a quiet NaN. */
+  float value;
+  memset(&value, 0xFF, sizeof(value));
+  return value;
+}
+#endif
+
+
+/* Division helpers: true division on Python 3, PyNumber_Divide on Python 2. */
+#if PY_MAJOR_VERSION >= 3
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_TrueDivide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceTrueDivide(x,y)
+#else
+  #define __Pyx_PyNumber_Divide(x,y)         PyNumber_Divide(x,y)
+  #define __Pyx_PyNumber_InPlaceDivide(x,y)  PyNumber_InPlaceDivide(x,y)
+#endif
+
+/* Linkage specifier: extern "C" when compiled as C++, plain extern otherwise. */
+#ifndef __PYX_EXTERN_C
+  #ifdef __cplusplus
+    #define __PYX_EXTERN_C extern "C"
+  #else
+    #define __PYX_EXTERN_C extern
+  #endif
+#endif
+
+#if defined(WIN32) || defined(MS_WINDOWS)
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+#define __PYX_HAVE__nms__gpu_nms
+#define __PYX_HAVE_API__nms__gpu_nms
+#include "string.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "gpu_nms.hpp"
+#ifdef _OPENMP
+#include <omp.h>
+#endif /* _OPENMP */
+
+/* Defining PYREX_WITHOUT_ASSERTIONS also defines CYTHON_WITHOUT_ASSERTIONS. */
+#ifdef PYREX_WITHOUT_ASSERTIONS
+#define CYTHON_WITHOUT_ASSERTIONS
+#endif
+
+/* CYTHON_UNUSED expands to the __unused__ attribute on GCC (in C++ only for
+ * GCC >= 3.4) and on the Intel compiler when not in MSVC mode; everywhere
+ * else it expands to nothing. */
+#ifndef CYTHON_UNUSED
+# if defined(__GNUC__)
+#   if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#     define CYTHON_UNUSED __attribute__ ((__unused__))
+#   else
+#     define CYTHON_UNUSED
+#   endif
+# elif defined(__ICC) || (defined(__INTEL_COMPILER) && !defined(_MSC_VER))
+#   define CYTHON_UNUSED __attribute__ ((__unused__))
+# else
+#   define CYTHON_UNUSED
+# endif
+#endif
+/* One row of the module's string table:
+ *   p          - where the created Python object is to be stored
+ *   s, n       - the C string data and an associated length
+ *   encoding   - optional encoding name
+ *   is_unicode, is_str, intern - flags selecting the kind of object
+ * NOTE(review): field roles are as read by a string-table initialiser such as
+ * __Pyx_InitStrings; `n` presumably counts the trailing NUL - confirm. */
+typedef struct {PyObject **p; char *s; const Py_ssize_t n; const char* encoding;
+                const char is_unicode; const char is_str; const char intern; } __Pyx_StringTabEntry; /*proto*/
+
+/* Default C-string encoding switches for this module: both are off and the
+ * encoding name is empty, so generic "object from string" conversions below
+ * map to the bytes constructors. */
+#define __PYX_DEFAULT_STRING_ENCODING_IS_ASCII 0
+#define __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT 0
+#define __PYX_DEFAULT_STRING_ENCODING ""
+#define __Pyx_PyObject_FromString __Pyx_PyBytes_FromString
+#define __Pyx_PyObject_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+/* Evaluates to true when value `v` of integer type `type` is within
+ * [PY_SSIZE_T_MIN, PY_SSIZE_T_MAX]; the size comparison between `type` and
+ * Py_ssize_t decides which bounds actually need checking. */
+#define __Pyx_fits_Py_ssize_t(v, type, is_signed)  (    \
+    (sizeof(type) < sizeof(Py_ssize_t))  ||             \
+    (sizeof(type) > sizeof(Py_ssize_t) &&               \
+          likely(v < (type)PY_SSIZE_T_MAX ||            \
+                 v == (type)PY_SSIZE_T_MAX)  &&         \
+          (!is_signed || likely(v > (type)PY_SSIZE_T_MIN ||       \
+                                v == (type)PY_SSIZE_T_MIN)))  ||  \
+    (sizeof(type) == sizeof(Py_ssize_t) &&              \
+          (is_signed || likely(v < (type)PY_SSIZE_T_MAX ||        \
+                               v == (type)PY_SSIZE_T_MAX)))  )
+static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject*);
+static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject*, Py_ssize_t* length);
+#define __Pyx_PyByteArray_FromString(s) PyByteArray_FromStringAndSize((const char*)s, strlen((const char*)s))
+#define __Pyx_PyByteArray_FromStringAndSize(s, l) PyByteArray_FromStringAndSize((const char*)s, l)
+#define __Pyx_PyBytes_FromString        PyBytes_FromString
+#define __Pyx_PyBytes_FromStringAndSize PyBytes_FromStringAndSize
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*);
+/* "Str" means bytes on Python 2 and unicode on Python 3. */
+#if PY_MAJOR_VERSION < 3
+    #define __Pyx_PyStr_FromString        __Pyx_PyBytes_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyBytes_FromStringAndSize
+#else
+    #define __Pyx_PyStr_FromString        __Pyx_PyUnicode_FromString
+    #define __Pyx_PyStr_FromStringAndSize __Pyx_PyUnicode_FromStringAndSize
+#endif
+/* Signed/unsigned char variants: thin casts around the char* helpers. */
+#define __Pyx_PyObject_AsSString(s)    ((signed char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_AsUString(s)    ((unsigned char*) __Pyx_PyObject_AsString(s))
+#define __Pyx_PyObject_FromUString(s)  __Pyx_PyObject_FromString((char*)s)
+#define __Pyx_PyBytes_FromUString(s)   __Pyx_PyBytes_FromString((char*)s)
+#define __Pyx_PyByteArray_FromUString(s)   __Pyx_PyByteArray_FromString((char*)s)
+#define __Pyx_PyStr_FromUString(s)     __Pyx_PyStr_FromString((char*)s)
+#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s)
+/* Length of a zero-terminated Py_UNICODE buffer, not counting the terminator.
+ * Python 2 gets a local implementation; Python 3 uses Py_UNICODE_strlen. */
+#if PY_MAJOR_VERSION < 3
+static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u)
+{
+    size_t unit_count = 0;
+    while (u[unit_count]) {
+        unit_count++;
+    }
+    return unit_count;
+}
+#else
+#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen
+#endif
+/* Py_UNICODE-buffer constructors and accessors. */
+#define __Pyx_PyUnicode_FromUnicode(u)       PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
+#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
+#define __Pyx_PyUnicode_AsUnicode            PyUnicode_AsUnicode
+/* Both macros below increment the reference count of the singleton they
+ * return; __Pyx_Owned_Py_None ignores its argument. */
+#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None)
+#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x);
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject*);
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t);
+/* Float extraction: on CPython, exact float objects are read with the
+ * PyFloat_AS_DOUBLE macro and everything else goes through PyFloat_AsDouble. */
+#if CYTHON_COMPILING_IN_CPYTHON
+#define __pyx_PyFloat_AsDouble(x) (PyFloat_CheckExact(x) ? PyFloat_AS_DOUBLE(x) : PyFloat_AsDouble(x))
+#else
+#define __pyx_PyFloat_AsDouble(x) PyFloat_AsDouble(x)
+#endif
+#define __pyx_PyFloat_AsFloat(x) ((float) __pyx_PyFloat_AsDouble(x))
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+static int __Pyx_sys_getdefaultencoding_not_ascii; /* 0 when sys.getdefaultencoding() is "ascii", 1 otherwise */
+/* Verify that sys.getdefaultencoding() is "ascii" or encodes the 128 ASCII code points unchanged; returns 0, or -1 with an exception set. */
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+    PyObject *sys = NULL, *default_encoding = NULL;
+    PyObject *ascii_chars_u = NULL, *ascii_chars_b = NULL;
+    const char* default_encoding_c;
+    sys = PyImport_ImportModule("sys");
+    if (sys == NULL) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    if (default_encoding == NULL) goto bad;
+    default_encoding_c = PyBytes_AsString(default_encoding); /* NULL (exception set) if the result is not a bytes object */
+    if (default_encoding_c == NULL) goto bad;
+    if (strcmp(default_encoding_c, "ascii") == 0) {
+        __Pyx_sys_getdefaultencoding_not_ascii = 0;
+    } else {
+        char ascii_chars[128];
+        int c;
+        for (c = 0; c < 128; c++) ascii_chars[c] = c;
+        __Pyx_sys_getdefaultencoding_not_ascii = 1;
+        ascii_chars_u = PyUnicode_DecodeASCII(ascii_chars, 128, NULL);
+        if (ascii_chars_u == NULL) goto bad;
+        ascii_chars_b = PyUnicode_AsEncodedString(ascii_chars_u, default_encoding_c, NULL);
+        /* ascii_chars[0] is NUL, so strncmp() would stop before comparing anything; check the length and all 128 bytes instead. */
+        if (ascii_chars_b == NULL || PyBytes_GET_SIZE(ascii_chars_b) != 128 || memcmp(ascii_chars, PyBytes_AS_STRING(ascii_chars_b), 128) != 0) {
+            PyErr_Format(
+                PyExc_ValueError,
+                "This module compiled with c_string_encoding=ascii, but default encoding '%.200s' is not a superset of ascii.",
+                default_encoding_c);
+            goto bad;
+        }
+    }
+    Py_XDECREF(sys);
+    Py_XDECREF(default_encoding);
+    Py_XDECREF(ascii_chars_u);
+    Py_XDECREF(ascii_chars_b);
+    return 0;
+bad:
+    Py_XDECREF(sys);
+    Py_XDECREF(default_encoding);
+    Py_XDECREF(ascii_chars_u);
+    Py_XDECREF(ascii_chars_b);
+    return -1;
+}
+#endif
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT && PY_MAJOR_VERSION >= 3
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_DecodeUTF8(c_str, size, NULL)
+#else
+#define __Pyx_PyUnicode_FromStringAndSize(c_str, size) PyUnicode_Decode(c_str, size, __PYX_DEFAULT_STRING_ENCODING, NULL)
+#if __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+static char* __PYX_DEFAULT_STRING_ENCODING; /* heap copy of the encoding name, filled in by the initialiser below */
+/* Copy the name returned by sys.getdefaultencoding() into __PYX_DEFAULT_STRING_ENCODING; returns 0, or -1 with an exception set. */
+static int __Pyx_init_sys_getdefaultencoding_params(void) {
+    PyObject *default_encoding = NULL, *sys = PyImport_ImportModule("sys");
+    char* default_encoding_c;
+    if (sys == NULL) goto bad;
+    default_encoding = PyObject_CallMethod(sys, (char*) (const char*) "getdefaultencoding", NULL);
+    if (default_encoding == NULL) goto bad;
+    default_encoding_c = PyBytes_AS_STRING(default_encoding);
+    __PYX_DEFAULT_STRING_ENCODING = (char*) malloc(strlen(default_encoding_c) + 1); /* +1: strcpy() also writes the terminating NUL */
+    if (__PYX_DEFAULT_STRING_ENCODING == NULL) { PyErr_NoMemory(); goto bad; }
+    strcpy(__PYX_DEFAULT_STRING_ENCODING, default_encoding_c);
+    Py_DECREF(sys);
+    Py_DECREF(default_encoding);
+    return 0;
+bad:
+    Py_XDECREF(sys);
+    Py_XDECREF(default_encoding);
+    return -1;
+}
+#endif
+#endif
+
+
+#ifdef __GNUC__
+  /* Use __builtin_expect only with GCC versions newer than 2.95 */
+  #if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))
+    #define likely(x)   __builtin_expect(!!(x), 1) /* !!(x) reduces any non-zero value to 1 before the hint */
+    #define unlikely(x) __builtin_expect(!!(x), 0)
+  #else /* __GNUC__ > 2 ... */
+    #define likely(x)   (x) /* no hint: the condition is passed through unchanged */
+    #define unlikely(x) (x)
+  #endif /* __GNUC__ > 2 ... */
+#else /* __GNUC__ */
+  #define likely(x)   (x) /* no hint: the condition is passed through unchanged */
+  #define unlikely(x) (x)
+#endif /* __GNUC__ */
+
+static PyObject *__pyx_m; /* presumably the module object; assigned outside this chunk, TODO confirm */
+static PyObject *__pyx_d; /* presumably the module dict; TODO confirm */
+static PyObject *__pyx_b; /* presumably the builtins module; TODO confirm */
+static PyObject *__pyx_empty_tuple;
+static PyObject *__pyx_empty_bytes;
+static int __pyx_lineno;
+static int __pyx_clineno = 0;
+static const char * __pyx_cfilenm= __FILE__; /* name of this C source file as reported by the compiler */
+static const char *__pyx_filename;
+
+#if !defined(CYTHON_CCOMPLEX) /* default: 1 for C++ or when _Complex_I is defined, otherwise 0 */
+  #if defined(__cplusplus)
+    #define CYTHON_CCOMPLEX 1
+  #elif defined(_Complex_I)
+    #define CYTHON_CCOMPLEX 1
+  #else
+    #define CYTHON_CCOMPLEX 0
+  #endif
+#endif
+#if CYTHON_CCOMPLEX /* pull in the complex header that matches the compilation language */
+  #ifdef __cplusplus
+    #include <complex>
+  #else
+    #include <complex.h>
+  #endif
+#endif
+#if CYTHON_CCOMPLEX && !defined(__cplusplus) && defined(__sun__) && defined(__GNUC__) /* C build with GCC on __sun__: replace the header's _Complex_I */
+  #undef _Complex_I
+  #define _Complex_I 1.0fj
+#endif
+
+
+static const char *__pyx_f[] = { /* source file names; presumably indexed when error locations are recorded, TODO confirm */
+  "gpu_nms.pyx",
+  "__init__.pxd",
+  "type.pxd",
+};
+#define IS_UNSIGNED(type) (((type) -1) > 0) /* -1 converted to an unsigned type becomes a positive value */
+struct __Pyx_StructField_;
+#define __PYX_BUF_FLAGS_PACKED_STRUCT (1 << 0)
+typedef struct {
+  const char* name; /* for error messages only */
+  struct __Pyx_StructField_* fields;
+  size_t size;     /* sizeof(type) */
+  size_t arraysize[8]; /* length of array in each dimension */
+  int ndim;
+  char typegroup; /* _R_eal, _C_omplex, Signed _I_nt, _U_nsigned int, _S_truct, _P_ointer, _O_bject, c_H_ar */
+  char is_unsigned;
+  int flags; /* presumably a bit set such as __PYX_BUF_FLAGS_PACKED_STRUCT; TODO confirm */
+} __Pyx_TypeInfo;
+typedef struct __Pyx_StructField_ { /* one named member of a struct type, at a byte offset */
+  __Pyx_TypeInfo* type;
+  const char* name;
+  size_t offset;
+} __Pyx_StructField;
+typedef struct {
+  __Pyx_StructField* field;
+  size_t parent_offset;
+} __Pyx_BufFmt_StackElem;
+typedef struct { /* state for checking a buffer format string; the users are outside this chunk */
+  __Pyx_StructField root;
+  __Pyx_BufFmt_StackElem* head;
+  size_t fmt_offset;
+  size_t new_count, enc_count;
+  size_t struct_alignment;
+  int is_complex;
+  char enc_type;
+  char new_packmode;
+  char enc_packmode;
+  char is_valid_array;
+} __Pyx_BufFmt_Context;
+
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":723
+ * # in Cython to enable them only on the right systems.
+ * 
+ * ctypedef npy_int8       int8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ */
+typedef npy_int8 __pyx_t_5numpy_int8_t; /* each __pyx_t_5numpy_* typedef mirrors the .pxd ctypedef quoted above it */
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":724
+ * 
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t
+ */
+typedef npy_int16 __pyx_t_5numpy_int16_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":725
+ * ctypedef npy_int8       int8_t
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_int64      int64_t
+ * #ctypedef npy_int96      int96_t
+ */
+typedef npy_int32 __pyx_t_5numpy_int32_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":726
+ * ctypedef npy_int16      int16_t
+ * ctypedef npy_int32      int32_t
+ * ctypedef npy_int64      int64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_int96      int96_t
+ * #ctypedef npy_int128     int128_t
+ */
+typedef npy_int64 __pyx_t_5numpy_int64_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":730
+ * #ctypedef npy_int128     int128_t
+ * 
+ * ctypedef npy_uint8      uint8_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ */
+typedef npy_uint8 __pyx_t_5numpy_uint8_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":731
+ * 
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t
+ */
+typedef npy_uint16 __pyx_t_5numpy_uint16_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":732
+ * ctypedef npy_uint8      uint8_t
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uint64     uint64_t
+ * #ctypedef npy_uint96     uint96_t
+ */
+typedef npy_uint32 __pyx_t_5numpy_uint32_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":733
+ * ctypedef npy_uint16     uint16_t
+ * ctypedef npy_uint32     uint32_t
+ * ctypedef npy_uint64     uint64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_uint96     uint96_t
+ * #ctypedef npy_uint128    uint128_t
+ */
+typedef npy_uint64 __pyx_t_5numpy_uint64_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":737
+ * #ctypedef npy_uint128    uint128_t
+ * 
+ * ctypedef npy_float32    float32_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_float64    float64_t
+ * #ctypedef npy_float80    float80_t
+ */
+typedef npy_float32 __pyx_t_5numpy_float32_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":738
+ * 
+ * ctypedef npy_float32    float32_t
+ * ctypedef npy_float64    float64_t             # <<<<<<<<<<<<<<
+ * #ctypedef npy_float80    float80_t
+ * #ctypedef npy_float128   float128_t
+ */
+typedef npy_float64 __pyx_t_5numpy_float64_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":747
+ * # The int types are mapped a bit surprising --
+ * # numpy.int corresponds to 'l' and numpy.long to 'q'
+ * ctypedef npy_long       int_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longlong   long_t
+ * ctypedef npy_longlong   longlong_t
+ */
+typedef npy_long __pyx_t_5numpy_int_t; /* int_t aliases npy_long, not npy_int */
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":748
+ * # numpy.int corresponds to 'l' and numpy.long to 'q'
+ * ctypedef npy_long       int_t
+ * ctypedef npy_longlong   long_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longlong   longlong_t
+ * 
+ */
+typedef npy_longlong __pyx_t_5numpy_long_t; /* long_t aliases npy_longlong, the same type as longlong_t below */
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":749
+ * ctypedef npy_long       int_t
+ * ctypedef npy_longlong   long_t
+ * ctypedef npy_longlong   longlong_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_ulong      uint_t
+ */
+typedef npy_longlong __pyx_t_5numpy_longlong_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":751
+ * ctypedef npy_longlong   longlong_t
+ * 
+ * ctypedef npy_ulong      uint_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong  ulong_t
+ * ctypedef npy_ulonglong  ulonglong_t
+ */
+typedef npy_ulong __pyx_t_5numpy_uint_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":752
+ * 
+ * ctypedef npy_ulong      uint_t
+ * ctypedef npy_ulonglong  ulong_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulong_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":753
+ * ctypedef npy_ulong      uint_t
+ * ctypedef npy_ulonglong  ulong_t
+ * ctypedef npy_ulonglong  ulonglong_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_intp       intp_t
+ */
+typedef npy_ulonglong __pyx_t_5numpy_ulonglong_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":755
+ * ctypedef npy_ulonglong  ulonglong_t
+ * 
+ * ctypedef npy_intp       intp_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_uintp      uintp_t
+ * 
+ */
+typedef npy_intp __pyx_t_5numpy_intp_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":756
+ * 
+ * ctypedef npy_intp       intp_t
+ * ctypedef npy_uintp      uintp_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_double     float_t
+ */
+typedef npy_uintp __pyx_t_5numpy_uintp_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":758
+ * ctypedef npy_uintp      uintp_t
+ * 
+ * ctypedef npy_double     float_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t
+ */
+typedef npy_double __pyx_t_5numpy_float_t; /* float_t aliases npy_double, the same type as double_t below */
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":759
+ * 
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_longdouble longdouble_t
+ * 
+ */
+typedef npy_double __pyx_t_5numpy_double_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":760
+ * ctypedef npy_double     float_t
+ * ctypedef npy_double     double_t
+ * ctypedef npy_longdouble longdouble_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_cfloat      cfloat_t
+ */
+typedef npy_longdouble __pyx_t_5numpy_longdouble_t;
+#if CYTHON_CCOMPLEX /* native complex type when enabled, otherwise a plain {real, imag} struct */
+  #ifdef __cplusplus
+    typedef ::std::complex< float > __pyx_t_float_complex;
+  #else
+    typedef float _Complex __pyx_t_float_complex;
+  #endif
+#else
+    typedef struct { float real, imag; } __pyx_t_float_complex;
+#endif
+
+#if CYTHON_CCOMPLEX /* same three-way choice for the double-precision type */
+  #ifdef __cplusplus
+    typedef ::std::complex< double > __pyx_t_double_complex;
+  #else
+    typedef double _Complex __pyx_t_double_complex;
+  #endif
+#else
+    typedef struct { double real, imag; } __pyx_t_double_complex;
+#endif
+
+
+/*--- Type declarations ---*/
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":762
+ * ctypedef npy_longdouble longdouble_t
+ * 
+ * ctypedef npy_cfloat      cfloat_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_cdouble     cdouble_t
+ * ctypedef npy_clongdouble clongdouble_t
+ */
+typedef npy_cfloat __pyx_t_5numpy_cfloat_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":763
+ * 
+ * ctypedef npy_cfloat      cfloat_t
+ * ctypedef npy_cdouble     cdouble_t             # <<<<<<<<<<<<<<
+ * ctypedef npy_clongdouble clongdouble_t
+ * 
+ */
+typedef npy_cdouble __pyx_t_5numpy_cdouble_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":764
+ * ctypedef npy_cfloat      cfloat_t
+ * ctypedef npy_cdouble     cdouble_t
+ * ctypedef npy_clongdouble clongdouble_t             # <<<<<<<<<<<<<<
+ * 
+ * ctypedef npy_cdouble     complex_t
+ */
+typedef npy_clongdouble __pyx_t_5numpy_clongdouble_t;
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":766
+ * ctypedef npy_clongdouble clongdouble_t
+ * 
+ * ctypedef npy_cdouble     complex_t             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):
+ */
+typedef npy_cdouble __pyx_t_5numpy_complex_t; /* complex_t aliases npy_cdouble, the same type as cdouble_t above */
+#ifndef CYTHON_REFNANNY /* reference-count checking is off unless the build defines CYTHON_REFNANNY */
+  #define CYTHON_REFNANNY 0
+#endif
+#if CYTHON_REFNANNY /* checked build: every refcount macro goes through the __Pyx_RefNanny function table */
+  typedef struct {
+    void (*INCREF)(void*, PyObject*, int);
+    void (*DECREF)(void*, PyObject*, int);
+    void (*GOTREF)(void*, PyObject*, int);
+    void (*GIVEREF)(void*, PyObject*, int);
+    void* (*SetupContext)(const char*, int, const char*);
+    void (*FinishContext)(void**);
+  } __Pyx_RefNannyAPIStruct;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNanny = NULL;
+  static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname); /*proto*/
+  #define __Pyx_RefNannyDeclarations void *__pyx_refnanny = NULL;
+#ifdef WITH_THREAD
+  #define __Pyx_RefNannySetupContext(name, acquire_gil) \
+          if (acquire_gil) { \
+              PyGILState_STATE __pyx_gilstate_save = PyGILState_Ensure(); \
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \
+              PyGILState_Release(__pyx_gilstate_save); \
+          } else { \
+              __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__); \
+          }
+#else
+  #define __Pyx_RefNannySetupContext(name, acquire_gil) \
+          __pyx_refnanny = __Pyx_RefNanny->SetupContext((name), __LINE__, __FILE__)
+#endif
+  #define __Pyx_RefNannyFinishContext() \
+          __Pyx_RefNanny->FinishContext(&__pyx_refnanny)
+  #define __Pyx_INCREF(r)  __Pyx_RefNanny->INCREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_DECREF(r)  __Pyx_RefNanny->DECREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_GOTREF(r)  __Pyx_RefNanny->GOTREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_GIVEREF(r) __Pyx_RefNanny->GIVEREF(__pyx_refnanny, (PyObject *)(r), __LINE__)
+  #define __Pyx_XINCREF(r)  do { if((r) != NULL) {__Pyx_INCREF(r); }} while(0)
+  #define __Pyx_XDECREF(r)  do { if((r) != NULL) {__Pyx_DECREF(r); }} while(0)
+  #define __Pyx_XGOTREF(r)  do { if((r) != NULL) {__Pyx_GOTREF(r); }} while(0)
+  #define __Pyx_XGIVEREF(r) do { if((r) != NULL) {__Pyx_GIVEREF(r);}} while(0)
+#else /* normal build: plain Py_* refcount macros; the GOTREF/GIVEREF bookkeeping expands to nothing */
+  #define __Pyx_RefNannyDeclarations
+  #define __Pyx_RefNannySetupContext(name, acquire_gil)
+  #define __Pyx_RefNannyFinishContext()
+  #define __Pyx_INCREF(r) Py_INCREF(r)
+  #define __Pyx_DECREF(r) Py_DECREF(r)
+  #define __Pyx_GOTREF(r)
+  #define __Pyx_GIVEREF(r)
+  #define __Pyx_XINCREF(r) Py_XINCREF(r)
+  #define __Pyx_XDECREF(r) Py_XDECREF(r)
+  #define __Pyx_XGOTREF(r)
+  #define __Pyx_XGIVEREF(r)
+#endif /* CYTHON_REFNANNY */
+#define __Pyx_XDECREF_SET(r, v) do {                            \
+        PyObject *tmp = (PyObject *) r;                         \
+        r = v; __Pyx_XDECREF(tmp);                              \
+    } while (0) /* r receives v before the previous value is released; the previous value may be NULL */
+#define __Pyx_DECREF_SET(r, v) do {                             \
+        PyObject *tmp = (PyObject *) r;                         \
+        r = v; __Pyx_DECREF(tmp);                               \
+    } while (0) /* same ordering, but the previous value is released with the non-X macro */
+#define __Pyx_CLEAR(r)    do { PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);} while(0) /* r is set to NULL before its old value is released */
+#define __Pyx_XCLEAR(r)   do { if((r) != NULL) {PyObject* tmp = ((PyObject*)(r)); r = NULL; __Pyx_DECREF(tmp);}} while(0) /* as __Pyx_CLEAR, skipped when r is already NULL */
+
+static void __Pyx_RaiseArgtupleInvalid(const char* func_name, int exact,
+    Py_ssize_t num_min, Py_ssize_t num_max, Py_ssize_t num_found); /* prototype only; the body is outside this chunk */
+
+static void __Pyx_RaiseDoubleKeywordsError(const char* func_name, PyObject* kw_name); /*proto*/
+
+static int __Pyx_ParseOptionalKeywords(PyObject *kwds, PyObject **argnames[], \
+    PyObject *kwds2, PyObject *values[], Py_ssize_t num_pos_args, \
+    const char* function_name); /*proto*/
+
+static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed,
+    const char *name, int exact); /*proto*/
+
+static CYTHON_INLINE int  __Pyx_GetBufferAndValidate(Py_buffer* buf, PyObject* obj,
+    __Pyx_TypeInfo* dtype, int flags, int nd, int cast, __Pyx_BufFmt_StackElem* stack); /* prototype only */
+static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info); /* prototype only */
+#if CYTHON_COMPILING_IN_CPYTHON
+/* Attribute lookup that calls the type's own getattr slot directly when one is set. */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject* attr_name) {
+    PyTypeObject* type = Py_TYPE(obj);
+    if (likely(type->tp_getattro != NULL)) return type->tp_getattro(obj, attr_name);
+#if PY_MAJOR_VERSION < 3
+    /* Builds before Python 3 also try the slot that takes the name as a C string. */
+    if (likely(type->tp_getattr != NULL)) return type->tp_getattr(obj, PyString_AS_STRING(attr_name));
+#endif
+    return PyObject_GetAttr(obj, attr_name); /* no slot was usable */
+}
+#else
+#define __Pyx_PyObject_GetAttrStr(o,n) PyObject_GetAttr(o,n)
+#endif
+
+static PyObject *__Pyx_GetBuiltinName(PyObject *name); /*proto*/
+
+static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name); /*proto*/
+
+#if CYTHON_COMPILING_IN_CPYTHON /* own helper on CPython; elsewhere a direct alias of PyObject_Call */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw); /*proto*/
+#else
+#define __Pyx_PyObject_Call(func, arg, kw) PyObject_Call(func, arg, kw)
+#endif
+
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type); /*proto*/
+
+static void __Pyx_RaiseBufferIndexError(int axis); /*proto*/
+
+#define __Pyx_BufPtrStrided1d(type, buf, i0, s0) (type)((char*)buf + i0 * s0) /* element address: the offset is added on a char*, so s0 counts bytes */
+#define __Pyx_BufPtrStrided2d(type, buf, i0, s0, i1, s1) (type)((char*)buf + i0 * s0 + i1 * s1) /* two-index form; arguments are not parenthesised in the expansion */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice(
+        PyObject* obj, Py_ssize_t cstart, Py_ssize_t cstop,
+        PyObject** py_start, PyObject** py_stop, PyObject** py_slice,
+        int has_cstart, int has_cstop, int wraparound);
+
+static void __Pyx_RaiseBufferFallbackError(void); /*proto*/
+
+static CYTHON_INLINE void __Pyx_ErrRestore(PyObject *type, PyObject *value, PyObject *tb); /*proto*/
+static CYTHON_INLINE void __Pyx_ErrFetch(PyObject **type, PyObject **value, PyObject **tb); /*proto*/
+
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause); /*proto*/
+
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected);
+
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index);
+
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void);
+
+typedef struct { /* per-dimension values; __Pyx_LocalBuf_ND below keeps one of these per dimension */
+  Py_ssize_t shape, strides, suboffsets;
+} __Pyx_Buf_DimInfo;
+typedef struct { /* a Py_buffer paired with a counter */
+  size_t refcount;
+  Py_buffer pybuffer;
+} __Pyx_Buffer;
+typedef struct {
+  __Pyx_Buffer *rcbuffer;
+  char *data;
+  __Pyx_Buf_DimInfo diminfo[8]; /* room for eight dimensions */
+} __Pyx_LocalBuf_ND;
+
+#if PY_MAJOR_VERSION < 3 /* own get/release functions before Python 3; direct aliases of the C-API otherwise */
+    static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags);
+    static void __Pyx_ReleaseBuffer(Py_buffer *view);
+#else
+    #define __Pyx_GetBuffer PyObject_GetBuffer
+    #define __Pyx_ReleaseBuffer PyBuffer_Release
+#endif
+
+
+static Py_ssize_t __Pyx_zeros[] = {0, 0, 0, 0, 0, 0, 0, 0}; /* eight zeros */
+static Py_ssize_t __Pyx_minusones[] = {-1, -1, -1, -1, -1, -1, -1, -1}; /* eight -1 values */
+
+static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); /*proto*/
+
+static CYTHON_INLINE npy_int32 __Pyx_PyInt_As_npy_int32(PyObject *);
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value);
+
+#if CYTHON_CCOMPLEX /* real/imaginary accessors for each representation of the complex types */
+  #ifdef __cplusplus
+    #define __Pyx_CREAL(z) ((z).real())
+    #define __Pyx_CIMAG(z) ((z).imag())
+  #else
+    #define __Pyx_CREAL(z) (__real__(z))
+    #define __Pyx_CIMAG(z) (__imag__(z))
+  #endif
+#else
+    #define __Pyx_CREAL(z) ((z).real)
+    #define __Pyx_CIMAG(z) ((z).imag)
+#endif
+#if (defined(_WIN32) || defined(__clang__)) && defined(__cplusplus) && CYTHON_CCOMPLEX /* these C++ builds set parts through the setter overloads */
+    #define __Pyx_SET_CREAL(z,x) ((z).real(x))
+    #define __Pyx_SET_CIMAG(z,y) ((z).imag(y))
+#else /* everywhere else the accessor expression is assigned to directly */
+    #define __Pyx_SET_CREAL(z,x) __Pyx_CREAL(z) = (x)
+    #define __Pyx_SET_CIMAG(z,y) __Pyx_CIMAG(z) = (y)
+#endif
+static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float, float);
+
+#if CYTHON_CCOMPLEX /* native complex: arithmetic maps onto the built-in operators */
+    #define __Pyx_c_eqf(a, b)   ((a)==(b))
+    #define __Pyx_c_sumf(a, b)  ((a)+(b))
+    #define __Pyx_c_difff(a, b) ((a)-(b))
+    #define __Pyx_c_prodf(a, b) ((a)*(b))
+    #define __Pyx_c_quotf(a, b) ((a)/(b))
+    #define __Pyx_c_negf(a)     (-(a))
+  #ifdef __cplusplus
+    #define __Pyx_c_is_zerof(z) ((z)==(float)0)
+    #define __Pyx_c_conjf(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_absf(z)     (::std::abs(z))
+        #define __Pyx_c_powf(a, b)  (::std::pow(a, b))
+    #endif
+  #else
+    #define __Pyx_c_is_zerof(z) ((z)==0)
+    #define __Pyx_c_conjf(z)    (conjf(z))
+    #if 1
+        #define __Pyx_c_absf(z)     (cabsf(z))
+        #define __Pyx_c_powf(a, b)  (cpowf(a, b))
+    #endif
+ #endif
+#else /* struct fallback: the same operations are declared as functions instead */
+    static CYTHON_INLINE int __Pyx_c_eqf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sumf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_difff(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prodf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quotf(__pyx_t_float_complex, __pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_negf(__pyx_t_float_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zerof(__pyx_t_float_complex);
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conjf(__pyx_t_float_complex);
+    #if 1
+        static CYTHON_INLINE float __Pyx_c_absf(__pyx_t_float_complex);
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_powf(__pyx_t_float_complex, __pyx_t_float_complex);
+    #endif
+#endif
+
+static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double, double);
+
+#if CYTHON_CCOMPLEX /* double-precision counterparts of the float helpers */
+    #define __Pyx_c_eq(a, b)   ((a)==(b))
+    #define __Pyx_c_sum(a, b)  ((a)+(b))
+    #define __Pyx_c_diff(a, b) ((a)-(b))
+    #define __Pyx_c_prod(a, b) ((a)*(b))
+    #define __Pyx_c_quot(a, b) ((a)/(b))
+    #define __Pyx_c_neg(a)     (-(a))
+  #ifdef __cplusplus
+    #define __Pyx_c_is_zero(z) ((z)==(double)0)
+    #define __Pyx_c_conj(z)    (::std::conj(z))
+    #if 1
+        #define __Pyx_c_abs(z)     (::std::abs(z))
+        #define __Pyx_c_pow(a, b)  (::std::pow(a, b))
+    #endif
+  #else
+    #define __Pyx_c_is_zero(z) ((z)==0)
+    #define __Pyx_c_conj(z)    (conj(z))
+    #if 1
+        #define __Pyx_c_abs(z)     (cabs(z))
+        #define __Pyx_c_pow(a, b)  (cpow(a, b))
+    #endif
+ #endif
+#else /* struct fallback: function declarations */
+    static CYTHON_INLINE int __Pyx_c_eq(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot(__pyx_t_double_complex, __pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg(__pyx_t_double_complex);
+    static CYTHON_INLINE int __Pyx_c_is_zero(__pyx_t_double_complex);
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj(__pyx_t_double_complex);
+    #if 1
+        static CYTHON_INLINE double __Pyx_c_abs(__pyx_t_double_complex);
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow(__pyx_t_double_complex, __pyx_t_double_complex);
+    #endif
+#endif
+
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *);
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value);
+
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *);
+
+static int __Pyx_check_binary_version(void);
+
+#if !defined(__Pyx_PyIdentifier_FromString) /* identifiers are PyString objects before Python 3 and PyUnicode objects otherwise */
+#if PY_MAJOR_VERSION < 3
+  #define __Pyx_PyIdentifier_FromString(s) PyString_FromString(s)
+#else
+  #define __Pyx_PyIdentifier_FromString(s) PyUnicode_FromString(s)
+#endif
+#endif
+
+static PyObject *__Pyx_ImportModule(const char *name); /*proto*/
+
+static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name, size_t size, int strict);  /*proto*/
+
+typedef struct { /* pairs a code_line value with a code object */
+    int code_line;
+    PyCodeObject* code_object;
+} __Pyx_CodeObjectCacheEntry;
+struct __Pyx_CodeObjectCache {
+    int count;
+    int max_count;
+    __Pyx_CodeObjectCacheEntry* entries;
+};
+static struct __Pyx_CodeObjectCache __pyx_code_cache = {0,0,NULL}; /* starts with count 0, max_count 0 and no entries array */
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line);
+static PyCodeObject *__pyx_find_code_object(int code_line);
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object);
+
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename); /*proto*/
+
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/
+
+
+/* Module declarations from 'cpython.buffer' */
+
+/* Module declarations from 'cpython.ref' */
+
+/* Module declarations from 'libc.string' */
+
+/* Module declarations from 'libc.stdio' */
+
+/* Module declarations from 'cpython.object' */
+
+/* Module declarations from '__builtin__' */
+
+/* Module declarations from 'cpython.type' */
+static PyTypeObject *__pyx_ptype_7cpython_4type_type = 0;
+
+/* Module declarations from 'libc.stdlib' */
+
+/* Module declarations from 'numpy' */
+
+/* Module declarations from 'numpy' */
+static PyTypeObject *__pyx_ptype_5numpy_dtype = 0; /* the __pyx_ptype_* pointers start as 0; their assignment is outside this chunk */
+static PyTypeObject *__pyx_ptype_5numpy_flatiter = 0;
+static PyTypeObject *__pyx_ptype_5numpy_broadcast = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ndarray = 0;
+static PyTypeObject *__pyx_ptype_5numpy_ufunc = 0;
+static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *, char *, char *, int *); /*proto*/
+
+/* Module declarations from 'nms.gpu_nms' */
+static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t = { "float32_t", NULL, sizeof(__pyx_t_5numpy_float32_t), { 0 }, 0, 'R', 0, 0 }; /* fields in order: name, fields, size, arraysize, ndim, typegroup ('R' = real), is_unsigned, flags */
+static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_int32_t = { "int32_t", NULL, sizeof(__pyx_t_5numpy_int32_t), { 0 }, 0, IS_UNSIGNED(__pyx_t_5numpy_int32_t) ? 'U' : 'I', IS_UNSIGNED(__pyx_t_5numpy_int32_t), 0 }; /* typegroup 'U' or 'I' is chosen from the signedness of the typedef */
+static __Pyx_TypeInfo __Pyx_TypeInfo_nn___pyx_t_5numpy_int_t = { "int_t", NULL, sizeof(__pyx_t_5numpy_int_t), { 0 }, 0, IS_UNSIGNED(__pyx_t_5numpy_int_t) ? 'U' : 'I', IS_UNSIGNED(__pyx_t_5numpy_int_t), 0 };
+#define __Pyx_MODULE_NAME "nms.gpu_nms"
+int __pyx_module_is_main_nms__gpu_nms = 0; /* not static, so it has external linkage; starts at 0 */
+/* Implementation of 'nms.gpu_nms' */
+static PyObject *__pyx_builtin_ValueError;
+static PyObject *__pyx_builtin_range;
+static PyObject *__pyx_builtin_RuntimeError;
+static PyObject *__pyx_pf_3nms_7gpu_nms_gpu_nms(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dets, PyObject *__pyx_v_thresh, __pyx_t_5numpy_int32_t __pyx_v_device_id); /* proto */
+static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /* proto */
+static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info); /* proto */
+static char __pyx_k_B[] = "B";
+static char __pyx_k_H[] = "H";
+static char __pyx_k_I[] = "I";
+static char __pyx_k_L[] = "L";
+static char __pyx_k_O[] = "O";
+static char __pyx_k_Q[] = "Q";
+static char __pyx_k_b[] = "b";
+static char __pyx_k_d[] = "d";
+static char __pyx_k_f[] = "f";
+static char __pyx_k_g[] = "g";
+static char __pyx_k_h[] = "h";
+static char __pyx_k_i[] = "i";
+static char __pyx_k_l[] = "l";
+static char __pyx_k_q[] = "q";
+static char __pyx_k_Zd[] = "Zd";
+static char __pyx_k_Zf[] = "Zf";
+static char __pyx_k_Zg[] = "Zg";
+static char __pyx_k_np[] = "np";
+static char __pyx_k_dets[] = "dets";
+static char __pyx_k_keep[] = "keep";
+static char __pyx_k_main[] = "__main__";
+static char __pyx_k_test[] = "__test__";
+static char __pyx_k_dtype[] = "dtype";
+static char __pyx_k_int32[] = "int32";
+static char __pyx_k_numpy[] = "numpy";
+static char __pyx_k_order[] = "order";
+static char __pyx_k_range[] = "range";
+static char __pyx_k_zeros[] = "zeros";
+static char __pyx_k_import[] = "__import__";
+static char __pyx_k_scores[] = "scores";
+static char __pyx_k_thresh[] = "thresh";
+static char __pyx_k_argsort[] = "argsort";
+static char __pyx_k_gpu_nms[] = "gpu_nms";
+static char __pyx_k_num_out[] = "num_out";
+static char __pyx_k_boxes_dim[] = "boxes_dim";
+static char __pyx_k_boxes_num[] = "boxes_num";
+static char __pyx_k_device_id[] = "device_id";
+static char __pyx_k_ValueError[] = "ValueError";
+static char __pyx_k_nms_gpu_nms[] = "nms.gpu_nms";
+static char __pyx_k_sorted_dets[] = "sorted_dets";
+static char __pyx_k_RuntimeError[] = "RuntimeError";
+static char __pyx_k_pyx_getbuffer[] = "__pyx_getbuffer";
+static char __pyx_k_pyx_releasebuffer[] = "__pyx_releasebuffer";
+static char __pyx_k_ndarray_is_not_C_contiguous[] = "ndarray is not C contiguous";
+static char __pyx_k_nfs_yoda_xinleic_Inf_Code_Faste[] = "/nfs.yoda/xinleic/Inf/Code/Faster-RCNN_TF/lib/nms/gpu_nms.pyx";
+static char __pyx_k_unknown_dtype_code_in_numpy_pxd[] = "unknown dtype code in numpy.pxd (%d)";
+static char __pyx_k_Format_string_allocated_too_shor[] = "Format string allocated too short, see comment in numpy.pxd";
+static char __pyx_k_Non_native_byte_order_not_suppor[] = "Non-native byte order not supported";
+static char __pyx_k_ndarray_is_not_Fortran_contiguou[] = "ndarray is not Fortran contiguous";
+static char __pyx_k_Format_string_allocated_too_shor_2[] = "Format string allocated too short.";
+static PyObject *__pyx_kp_u_Format_string_allocated_too_shor;
+static PyObject *__pyx_kp_u_Format_string_allocated_too_shor_2;
+static PyObject *__pyx_kp_u_Non_native_byte_order_not_suppor;
+static PyObject *__pyx_n_s_RuntimeError;
+static PyObject *__pyx_n_s_ValueError;
+static PyObject *__pyx_n_s_argsort;
+static PyObject *__pyx_n_s_boxes_dim;
+static PyObject *__pyx_n_s_boxes_num;
+static PyObject *__pyx_n_s_dets;
+static PyObject *__pyx_n_s_device_id;
+static PyObject *__pyx_n_s_dtype;
+static PyObject *__pyx_n_s_gpu_nms;
+static PyObject *__pyx_n_s_import;
+static PyObject *__pyx_n_s_int32;
+static PyObject *__pyx_n_s_keep;
+static PyObject *__pyx_n_s_main;
+static PyObject *__pyx_kp_u_ndarray_is_not_C_contiguous;
+static PyObject *__pyx_kp_u_ndarray_is_not_Fortran_contiguou;
+static PyObject *__pyx_kp_s_nfs_yoda_xinleic_Inf_Code_Faste;
+static PyObject *__pyx_n_s_nms_gpu_nms;
+static PyObject *__pyx_n_s_np;
+static PyObject *__pyx_n_s_num_out;
+static PyObject *__pyx_n_s_numpy;
+static PyObject *__pyx_n_s_order;
+static PyObject *__pyx_n_s_pyx_getbuffer;
+static PyObject *__pyx_n_s_pyx_releasebuffer;
+static PyObject *__pyx_n_s_range;
+static PyObject *__pyx_n_s_scores;
+static PyObject *__pyx_n_s_sorted_dets;
+static PyObject *__pyx_n_s_test;
+static PyObject *__pyx_n_s_thresh;
+static PyObject *__pyx_kp_u_unknown_dtype_code_in_numpy_pxd;
+static PyObject *__pyx_n_s_zeros;
+static PyObject *__pyx_int_4;
+static PyObject *__pyx_int_neg_1;
+static PyObject *__pyx_slice_;
+static PyObject *__pyx_slice__3;
+static PyObject *__pyx_slice__4;
+static PyObject *__pyx_tuple__2;
+static PyObject *__pyx_tuple__5;
+static PyObject *__pyx_tuple__6;
+static PyObject *__pyx_tuple__7;
+static PyObject *__pyx_tuple__8;
+static PyObject *__pyx_tuple__9;
+static PyObject *__pyx_tuple__10;
+static PyObject *__pyx_tuple__11;
+static PyObject *__pyx_codeobj__12;
+
+/* "nms/gpu_nms.pyx":16
+ *     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
+ * 
+ * def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,             # <<<<<<<<<<<<<<
+ *             np.int32_t device_id=0):
+ *     cdef int boxes_num = dets.shape[0]
+ */
+
+/* Python wrapper
+ *
+ * Entry point described by the PyMethodDef below under the name "gpu_nms"
+ * (METH_VARARGS|METH_KEYWORDS, no docstring).
+ *
+ * Unpacks `dets`, `thresh` and the optional `device_id` (default 0) from the
+ * positional tuple and/or the keyword dict, converts device_id to npy_int32,
+ * type-checks dets against the numpy ndarray type and thresh against
+ * PyFloat_Type, then forwards to __pyx_pf_3nms_7gpu_nms_gpu_nms.
+ *
+ * Returns whatever the implementation returns, or NULL after recording a
+ * traceback entry when argument handling fails. 2 to 3 arguments are
+ * accepted; anything else ends at __pyx_L5_argtuple_error.
+ */
+static PyObject *__pyx_pw_3nms_7gpu_nms_1gpu_nms(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/
+static PyMethodDef __pyx_mdef_3nms_7gpu_nms_1gpu_nms = {__Pyx_NAMESTR("gpu_nms"), (PyCFunction)__pyx_pw_3nms_7gpu_nms_1gpu_nms, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)};
+static PyObject *__pyx_pw_3nms_7gpu_nms_1gpu_nms(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) {
+  PyArrayObject *__pyx_v_dets = 0;
+  PyObject *__pyx_v_thresh = 0;
+  __pyx_t_5numpy_int32_t __pyx_v_device_id;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  PyObject *__pyx_r = 0;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("gpu_nms (wrapper)", 0);
+  {
+    static PyObject **__pyx_pyargnames[] = {&__pyx_n_s_dets,&__pyx_n_s_thresh,&__pyx_n_s_device_id,0};
+    PyObject* values[3] = {0,0,0}; /* borrowed references, one slot per parameter in declaration order */
+    if (unlikely(__pyx_kwds)) {
+      Py_ssize_t kw_args;
+      const Py_ssize_t pos_args = PyTuple_GET_SIZE(__pyx_args);
+      switch (pos_args) { /* cases fall through on purpose: N positional args fill values[N-1] down to values[0] */
+        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+        case  1: values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+        case  0: break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+      kw_args = PyDict_Size(__pyx_kwds);
+      switch (pos_args) { /* fill the parameters not given positionally from keywords; also falls through */
+        case  0:
+        if (likely((values[0] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_dets)) != 0)) kw_args--;
+        else goto __pyx_L5_argtuple_error; /* dets is required */
+        case  1:
+        if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_thresh)) != 0)) kw_args--;
+        else { /* thresh is required as well */
+          __Pyx_RaiseArgtupleInvalid("gpu_nms", 0, 2, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+        }
+        case  2:
+        if (kw_args > 0) { /* device_id is optional: only looked up while unconsumed keywords remain */
+          PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s_device_id);
+          if (value) { values[2] = value; kw_args--; }
+        }
+      }
+      if (unlikely(kw_args > 0)) { /* keywords left over after the lookups above; presumably rejected or matched by the helper - see its definition */
+        if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "gpu_nms") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+      }
+    } else {
+      switch (PyTuple_GET_SIZE(__pyx_args)) { /* positional-only call: exactly 2 or 3 arguments are accepted */
+        case  3: values[2] = PyTuple_GET_ITEM(__pyx_args, 2);
+        case  2: values[1] = PyTuple_GET_ITEM(__pyx_args, 1);
+        values[0] = PyTuple_GET_ITEM(__pyx_args, 0);
+        break;
+        default: goto __pyx_L5_argtuple_error;
+      }
+    }
+    __pyx_v_dets = ((PyArrayObject *)values[0]); /* plain cast; the actual type check happens after unpacking, below */
+    __pyx_v_thresh = ((PyObject*)values[1]);
+    if (values[2]) {
+      __pyx_v_device_id = __Pyx_PyInt_As_npy_int32(values[2]); if (unlikely((__pyx_v_device_id == (npy_int32)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 17; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+    } else {
+      __pyx_v_device_id = ((__pyx_t_5numpy_int32_t)0); /* default from the .pyx signature: device_id=0 */
+    }
+  }
+  goto __pyx_L4_argument_unpacking_done;
+  __pyx_L5_argtuple_error:;
+  __Pyx_RaiseArgtupleInvalid("gpu_nms", 0, 2, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L3_error;}
+  __pyx_L3_error:;
+  __Pyx_AddTraceback("nms.gpu_nms.gpu_nms", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __Pyx_RefNannyFinishContext();
+  return NULL;
+  __pyx_L4_argument_unpacking_done:;
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_dets), __pyx_ptype_5numpy_ndarray, 1, "dets", 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* NOTE(review): the literal 1 is presumably the helper's "None allowed" flag; if so None passes this check while the implementation dereferences dets->dimensions unconditionally - confirm, and consider declaring `dets not None` in the .pyx */
+  if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_thresh), (&PyFloat_Type), 1, "thresh", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_r = __pyx_pf_3nms_7gpu_nms_gpu_nms(__pyx_self, __pyx_v_dets, __pyx_v_thresh, __pyx_v_device_id);
+
+  /* function exit code */
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+/* Implementation of gpu_nms(dets, thresh, device_id); called by the Python
+ * wrapper above once the arguments have been unpacked and type-checked.
+ *
+ * Follows the quoted nms/gpu_nms.pyx lines step by step:
+ *   1. acquire a 2-D float32 buffer view on dets;
+ *   2. keep = np.zeros(boxes_num, dtype=np.int32);
+ *   3. scores = dets[:, 4]; order = scores.argsort()[::-1];
+ *   4. sorted_dets = dets[order, :];
+ *   5. call the external _nms() with pointers to keep[0] and sorted_dets[0, 0];
+ *   6. return list(order[keep[:num_out]]).
+ *
+ * Returns a new reference to that list, or NULL after adding a traceback
+ * entry when any step fails. Both exit paths release all five buffer views
+ * and drop the references held in the named locals.
+ */
+static PyObject *__pyx_pf_3nms_7gpu_nms_gpu_nms(CYTHON_UNUSED PyObject *__pyx_self, PyArrayObject *__pyx_v_dets, PyObject *__pyx_v_thresh, __pyx_t_5numpy_int32_t __pyx_v_device_id) {
+  int __pyx_v_boxes_num;
+  int __pyx_v_boxes_dim;
+  int __pyx_v_num_out; /* no initialiser: presumably written by _nms() through the pointer passed to it below */
+  PyArrayObject *__pyx_v_keep = 0;
+  PyArrayObject *__pyx_v_scores = 0;
+  PyArrayObject *__pyx_v_order = 0;
+  PyArrayObject *__pyx_v_sorted_dets = 0;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_dets;
+  __Pyx_Buffer __pyx_pybuffer_dets;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_keep;
+  __Pyx_Buffer __pyx_pybuffer_keep;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_order;
+  __Pyx_Buffer __pyx_pybuffer_order;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_scores;
+  __Pyx_Buffer __pyx_pybuffer_scores;
+  __Pyx_LocalBuf_ND __pyx_pybuffernd_sorted_dets;
+  __Pyx_Buffer __pyx_pybuffer_sorted_dets;
+  PyObject *__pyx_r = NULL;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  PyObject *__pyx_t_2 = NULL;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  PyObject *__pyx_t_5 = NULL;
+  PyArrayObject *__pyx_t_6 = NULL;
+  PyArrayObject *__pyx_t_7 = NULL;
+  PyArrayObject *__pyx_t_8 = NULL;
+  PyArrayObject *__pyx_t_9 = NULL;
+  long __pyx_t_10;
+  int __pyx_t_11;
+  long __pyx_t_12;
+  long __pyx_t_13;
+  float __pyx_t_14;
+  PyObject *__pyx_t_15 = NULL;
+  PyObject *__pyx_t_16 = NULL;
+  PyObject *__pyx_t_17 = NULL;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("gpu_nms", 0);
+  __pyx_pybuffer_keep.pybuffer.buf = NULL; /* all five buffer views start out with buf == NULL, presumably so the unconditional __Pyx_SafeReleaseBuffer calls in the exit code can tell which were never acquired */
+  __pyx_pybuffer_keep.refcount = 0;
+  __pyx_pybuffernd_keep.data = NULL;
+  __pyx_pybuffernd_keep.rcbuffer = &__pyx_pybuffer_keep;
+  __pyx_pybuffer_scores.pybuffer.buf = NULL;
+  __pyx_pybuffer_scores.refcount = 0;
+  __pyx_pybuffernd_scores.data = NULL;
+  __pyx_pybuffernd_scores.rcbuffer = &__pyx_pybuffer_scores;
+  __pyx_pybuffer_order.pybuffer.buf = NULL;
+  __pyx_pybuffer_order.refcount = 0;
+  __pyx_pybuffernd_order.data = NULL;
+  __pyx_pybuffernd_order.rcbuffer = &__pyx_pybuffer_order;
+  __pyx_pybuffer_sorted_dets.pybuffer.buf = NULL;
+  __pyx_pybuffer_sorted_dets.refcount = 0;
+  __pyx_pybuffernd_sorted_dets.data = NULL;
+  __pyx_pybuffernd_sorted_dets.rcbuffer = &__pyx_pybuffer_sorted_dets;
+  __pyx_pybuffer_dets.pybuffer.buf = NULL;
+  __pyx_pybuffer_dets.refcount = 0;
+  __pyx_pybuffernd_dets.data = NULL;
+  __pyx_pybuffernd_dets.rcbuffer = &__pyx_pybuffer_dets;
+  { /* acquire the view on the dets argument: float32 element type, 2 dimensions, format + strides requested */
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_dets.rcbuffer->pybuffer, (PyObject*)__pyx_v_dets, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+  __pyx_pybuffernd_dets.diminfo[0].strides = __pyx_pybuffernd_dets.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_dets.diminfo[0].shape = __pyx_pybuffernd_dets.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_dets.diminfo[1].strides = __pyx_pybuffernd_dets.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_dets.diminfo[1].shape = __pyx_pybuffernd_dets.rcbuffer->pybuffer.shape[1];
+
+  /* "nms/gpu_nms.pyx":18
+ * def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
+ *             np.int32_t device_id=0):
+ *     cdef int boxes_num = dets.shape[0]             # <<<<<<<<<<<<<<
+ *     cdef int boxes_dim = dets.shape[1]
+ *     cdef int num_out
+ */
+  __pyx_v_boxes_num = (__pyx_v_dets->dimensions[0]); /* read straight from the ndarray struct and narrowed to int; NOTE(review): unsafe if dets can be None here - see the type check in the wrapper */
+
+  /* "nms/gpu_nms.pyx":19
+ *             np.int32_t device_id=0):
+ *     cdef int boxes_num = dets.shape[0]
+ *     cdef int boxes_dim = dets.shape[1]             # <<<<<<<<<<<<<<
+ *     cdef int num_out
+ *     cdef np.ndarray[np.int32_t, ndim=1] \
+ */
+  __pyx_v_boxes_dim = (__pyx_v_dets->dimensions[1]);
+
+  /* "nms/gpu_nms.pyx":22
+ *     cdef int num_out
+ *     cdef np.ndarray[np.int32_t, ndim=1] \
+ *         keep = np.zeros(boxes_num, dtype=np.int32)             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=1] \
+ *         scores = dets[:, 4]
+ */
+  __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_zeros); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_2);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyInt_From_int(__pyx_v_boxes_num); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_3 = PyTuple_New(1); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_3);
+  PyTuple_SET_ITEM(__pyx_t_3, 0, __pyx_t_1); /* positional args tuple: (boxes_num,) */
+  __Pyx_GIVEREF(__pyx_t_1);
+  __pyx_t_1 = 0;
+  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* keyword dict: {"dtype": np.int32} */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_4 = __Pyx_GetModuleGlobalName(__pyx_n_s_np); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_4);
+  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(__pyx_t_4, __pyx_n_s_int32); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+  if (PyDict_SetItem(__pyx_t_1, __pyx_n_s_dtype, __pyx_t_5) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __pyx_t_5 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_3, __pyx_t_1); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
+  __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (!(likely(((__pyx_t_5) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_5, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 22; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_6 = ((PyArrayObject *)__pyx_t_5);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_keep.rcbuffer->pybuffer, (PyObject*)__pyx_t_6, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_keep = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_keep.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 21; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* reported as line 21: that is where the two-line cdef declaration of keep starts in the .pyx */
+    } else {__pyx_pybuffernd_keep.diminfo[0].strides = __pyx_pybuffernd_keep.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_keep.diminfo[0].shape = __pyx_pybuffernd_keep.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_6 = 0;
+  __pyx_v_keep = ((PyArrayObject *)__pyx_t_5); /* ownership of the reference moves from the temporary to the named local */
+  __pyx_t_5 = 0;
+
+  /* "nms/gpu_nms.pyx":24
+ *         keep = np.zeros(boxes_num, dtype=np.int32)
+ *     cdef np.ndarray[np.float32_t, ndim=1] \
+ *         scores = dets[:, 4]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.int_t, ndim=1] \
+ *         order = scores.argsort()[::-1]
+ */
+  __pyx_t_5 = PyObject_GetItem(((PyObject *)__pyx_v_dets), __pyx_tuple__2); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_5);
+  if (!(likely(((__pyx_t_5) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_5, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_7 = ((PyArrayObject *)__pyx_t_5);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_scores.rcbuffer->pybuffer, (PyObject*)__pyx_t_7, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_scores = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_scores.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_scores.diminfo[0].strides = __pyx_pybuffernd_scores.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_scores.diminfo[0].shape = __pyx_pybuffernd_scores.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_7 = 0;
+  __pyx_v_scores = ((PyArrayObject *)__pyx_t_5);
+  __pyx_t_5 = 0;
+
+  /* "nms/gpu_nms.pyx":26
+ *         scores = dets[:, 4]
+ *     cdef np.ndarray[np.int_t, ndim=1] \
+ *         order = scores.argsort()[::-1]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=2] \
+ *         sorted_dets = dets[order, :]
+ */
+  __pyx_t_5 = __Pyx_PyObject_GetAttrStr(((PyObject *)__pyx_v_scores), __pyx_n_s_argsort); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_5);
+  __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_5, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __pyx_t_5 = PyObject_GetItem(__pyx_t_1, __pyx_slice__3); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+  if (!(likely(((__pyx_t_5) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_5, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_8 = ((PyArrayObject *)__pyx_t_5);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_order.rcbuffer->pybuffer, (PyObject*)__pyx_t_8, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+      __pyx_v_order = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_order.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 25; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_order.diminfo[0].strides = __pyx_pybuffernd_order.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_order.diminfo[0].shape = __pyx_pybuffernd_order.rcbuffer->pybuffer.shape[0];
+    }
+  }
+  __pyx_t_8 = 0;
+  __pyx_v_order = ((PyArrayObject *)__pyx_t_5);
+  __pyx_t_5 = 0;
+
+  /* "nms/gpu_nms.pyx":28
+ *         order = scores.argsort()[::-1]
+ *     cdef np.ndarray[np.float32_t, ndim=2] \
+ *         sorted_dets = dets[order, :]             # <<<<<<<<<<<<<<
+ *     _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
+ *     keep = keep[:num_out]
+ */
+  __pyx_t_5 = PyTuple_New(2); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* index tuple (order, <slice>) built at run time because order is a local */
+  __Pyx_GOTREF(__pyx_t_5);
+  __Pyx_INCREF(((PyObject *)__pyx_v_order));
+  PyTuple_SET_ITEM(__pyx_t_5, 0, ((PyObject *)__pyx_v_order));
+  __Pyx_GIVEREF(((PyObject *)__pyx_v_order));
+  __Pyx_INCREF(__pyx_slice__4);
+  PyTuple_SET_ITEM(__pyx_t_5, 1, __pyx_slice__4);
+  __Pyx_GIVEREF(__pyx_slice__4);
+  __pyx_t_1 = PyObject_GetItem(((PyObject *)__pyx_v_dets), __pyx_t_5); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_9 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer, (PyObject*)__pyx_t_9, &__Pyx_TypeInfo_nn___pyx_t_5numpy_float32_t, PyBUF_FORMAT| PyBUF_STRIDES, 2, 0, __pyx_stack) == -1)) {
+      __pyx_v_sorted_dets = ((PyArrayObject *)Py_None); __Pyx_INCREF(Py_None); __pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer.buf = NULL;
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    } else {__pyx_pybuffernd_sorted_dets.diminfo[0].strides = __pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_sorted_dets.diminfo[0].shape = __pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer.shape[0]; __pyx_pybuffernd_sorted_dets.diminfo[1].strides = __pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer.strides[1]; __pyx_pybuffernd_sorted_dets.diminfo[1].shape = __pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer.shape[1];
+    }
+  }
+  __pyx_t_9 = 0;
+  __pyx_v_sorted_dets = ((PyArrayObject *)__pyx_t_1);
+  __pyx_t_1 = 0;
+
+  /* "nms/gpu_nms.pyx":29
+ *     cdef np.ndarray[np.float32_t, ndim=2] \
+ *         sorted_dets = dets[order, :]
+ *     _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)             # <<<<<<<<<<<<<<
+ *     keep = keep[:num_out]
+ *     return list(order[keep])
+ */
+  __pyx_t_10 = 0; /* index 0 into keep; the bounds check below fails when keep has no elements, i.e. for an empty dets */
+  __pyx_t_11 = -1; /* -1 means "in bounds"; otherwise it holds the number of the offending axis */
+  if (__pyx_t_10 < 0) {
+    __pyx_t_10 += __pyx_pybuffernd_keep.diminfo[0].shape;
+    if (unlikely(__pyx_t_10 < 0)) __pyx_t_11 = 0;
+  } else if (unlikely(__pyx_t_10 >= __pyx_pybuffernd_keep.diminfo[0].shape)) __pyx_t_11 = 0;
+  if (unlikely(__pyx_t_11 != -1)) {
+    __Pyx_RaiseBufferIndexError(__pyx_t_11);
+    {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+  __pyx_t_12 = 0; /* indices (0, 0) into sorted_dets, bounds-checked per axis in the same way */
+  __pyx_t_13 = 0;
+  __pyx_t_11 = -1;
+  if (__pyx_t_12 < 0) {
+    __pyx_t_12 += __pyx_pybuffernd_sorted_dets.diminfo[0].shape;
+    if (unlikely(__pyx_t_12 < 0)) __pyx_t_11 = 0;
+  } else if (unlikely(__pyx_t_12 >= __pyx_pybuffernd_sorted_dets.diminfo[0].shape)) __pyx_t_11 = 0;
+  if (__pyx_t_13 < 0) {
+    __pyx_t_13 += __pyx_pybuffernd_sorted_dets.diminfo[1].shape;
+    if (unlikely(__pyx_t_13 < 0)) __pyx_t_11 = 1;
+  } else if (unlikely(__pyx_t_13 >= __pyx_pybuffernd_sorted_dets.diminfo[1].shape)) __pyx_t_11 = 1;
+  if (unlikely(__pyx_t_11 != -1)) {
+    __Pyx_RaiseBufferIndexError(__pyx_t_11);
+    {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+  __pyx_t_14 = __pyx_PyFloat_AsFloat(__pyx_v_thresh); if (unlikely((__pyx_t_14 == (float)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* thresh arrives as a Python object and is converted to a C float for _nms */
+  _nms((&(*__Pyx_BufPtrStrided1d(__pyx_t_5numpy_int32_t *, __pyx_pybuffernd_keep.rcbuffer->pybuffer.buf, __pyx_t_10, __pyx_pybuffernd_keep.diminfo[0].strides))), (&__pyx_v_num_out), (&(*__Pyx_BufPtrStrided2d(__pyx_t_5numpy_float32_t *, __pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer.buf, __pyx_t_12, __pyx_pybuffernd_sorted_dets.diminfo[0].strides, __pyx_t_13, __pyx_pybuffernd_sorted_dets.diminfo[1].strides))), __pyx_v_boxes_num, __pyx_v_boxes_dim, __pyx_t_14, __pyx_v_device_id); /* raw element pointers are handed to the external _nms; NOTE(review): only PyBUF_STRIDES was requested for these views, so a contiguous layout is assumed rather than checked here */
+
+  /* "nms/gpu_nms.pyx":30
+ *         sorted_dets = dets[order, :]
+ *     _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
+ *     keep = keep[:num_out]             # <<<<<<<<<<<<<<
+ *     return list(order[keep])
+ */
+  __pyx_t_1 = __Pyx_PyObject_GetSlice(((PyObject *)__pyx_v_keep), 0, __pyx_v_num_out, NULL, NULL, NULL, 0, 1, 1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  if (!(likely(((__pyx_t_1) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_1, __pyx_ptype_5numpy_ndarray))))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_t_6 = ((PyArrayObject *)__pyx_t_1);
+  {
+    __Pyx_BufFmt_StackElem __pyx_stack[1];
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_keep.rcbuffer->pybuffer); /* drop the view on the old keep array before viewing the slice */
+    __pyx_t_11 = __Pyx_GetBufferAndValidate(&__pyx_pybuffernd_keep.rcbuffer->pybuffer, (PyObject*)__pyx_t_6, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack);
+    if (unlikely(__pyx_t_11 < 0)) { /* viewing the slice failed: stash the error and try to re-acquire the view on the old array */
+      PyErr_Fetch(&__pyx_t_15, &__pyx_t_16, &__pyx_t_17);
+      if (unlikely(__Pyx_GetBufferAndValidate(&__pyx_pybuffernd_keep.rcbuffer->pybuffer, (PyObject*)__pyx_v_keep, &__Pyx_TypeInfo_nn___pyx_t_5numpy_int32_t, PyBUF_FORMAT| PyBUF_STRIDES, 1, 0, __pyx_stack) == -1)) {
+        Py_XDECREF(__pyx_t_15); Py_XDECREF(__pyx_t_16); Py_XDECREF(__pyx_t_17);
+        __Pyx_RaiseBufferFallbackError();
+      } else {
+        PyErr_Restore(__pyx_t_15, __pyx_t_16, __pyx_t_17);
+      }
+    }
+    __pyx_pybuffernd_keep.diminfo[0].strides = __pyx_pybuffernd_keep.rcbuffer->pybuffer.strides[0]; __pyx_pybuffernd_keep.diminfo[0].shape = __pyx_pybuffernd_keep.rcbuffer->pybuffer.shape[0];
+    if (unlikely(__pyx_t_11 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+  __pyx_t_6 = 0;
+  __Pyx_DECREF_SET(__pyx_v_keep, ((PyArrayObject *)__pyx_t_1)); /* the local keep is rebound to the slice keep[:num_out] */
+  __pyx_t_1 = 0;
+
+  /* "nms/gpu_nms.pyx":31
+ *     _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
+ *     keep = keep[:num_out]
+ *     return list(order[keep])             # <<<<<<<<<<<<<<
+ */
+  __Pyx_XDECREF(__pyx_r);
+  __pyx_t_1 = PyObject_GetItem(((PyObject *)__pyx_v_order), ((PyObject *)__pyx_v_keep)); if (unlikely(__pyx_t_1 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; /* order[keep]: maps the kept positions back through order */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_t_5 = PyTuple_New(1); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_5);
+  PyTuple_SET_ITEM(__pyx_t_5, 0, __pyx_t_1);
+  __Pyx_GIVEREF(__pyx_t_1);
+  __pyx_t_1 = 0;
+  __pyx_t_1 = __Pyx_PyObject_Call(((PyObject *)((PyObject*)(&PyList_Type))), __pyx_t_5, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  __Pyx_DECREF(__pyx_t_5); __pyx_t_5 = 0;
+  __pyx_r = __pyx_t_1;
+  __pyx_t_1 = 0;
+  goto __pyx_L0;
+
+  /* "nms/gpu_nms.pyx":16
+ *     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
+ * 
+ * def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,             # <<<<<<<<<<<<<<
+ *             np.int32_t device_id=0):
+ *     cdef int boxes_num = dets.shape[0]
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit: drop live temporaries, release the buffer views, add a traceback entry, return NULL */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_2);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_5);
+  { PyObject *__pyx_type, *__pyx_value, *__pyx_tb; /* the pending exception is fetched before the releases and restored afterwards, presumably so the releases cannot disturb it */
+    __Pyx_ErrFetch(&__pyx_type, &__pyx_value, &__pyx_tb);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dets.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_keep.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_order.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_scores.rcbuffer->pybuffer);
+    __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer);
+  __Pyx_ErrRestore(__pyx_type, __pyx_value, __pyx_tb);}
+  __Pyx_AddTraceback("nms.gpu_nms.gpu_nms", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  goto __pyx_L2;
+  __pyx_L0:; /* normal exit: release the buffer views */
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_dets.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_keep.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_order.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_scores.rcbuffer->pybuffer);
+  __Pyx_SafeReleaseBuffer(&__pyx_pybuffernd_sorted_dets.rcbuffer->pybuffer);
+  __pyx_L2:; /* common tail of both paths: drop the references held by the named locals */
+  __Pyx_XDECREF((PyObject *)__pyx_v_keep);
+  __Pyx_XDECREF((PyObject *)__pyx_v_scores);
+  __Pyx_XDECREF((PyObject *)__pyx_v_order);
+  __Pyx_XDECREF((PyObject *)__pyx_v_sorted_dets);
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":194
+ *         # experimental exception made for __getbuffer__ and __releasebuffer__
+ *         # -- the details of this may change.
+ *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
+ *             # This implementation of getbuffer is geared towards Cython
+ *             # requirements, and does not yet fullfill the PEP.
+ */
+
+/* Python wrapper
+ *
+ * Thin shim for ndarray.__getbuffer__: casts self to PyArrayObject* and
+ * forwards info and flags unchanged to __pyx_pf_5numpy_7ndarray___getbuffer__,
+ * returning that function's int result as-is. No argument checking is done
+ * here. Declared CYTHON_UNUSED, presumably because not every build ends up
+ * referencing it.
+ */
+static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags); /*proto*/
+static CYTHON_UNUSED int __pyx_pw_5numpy_7ndarray_1__getbuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__getbuffer__ (wrapper)", 0);
+  __pyx_r = __pyx_pf_5numpy_7ndarray___getbuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info), ((int)__pyx_v_flags));
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static int __pyx_pf_5numpy_7ndarray___getbuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info, int __pyx_v_flags) {
+  int __pyx_v_copy_shape;
+  int __pyx_v_i;
+  int __pyx_v_ndim;
+  int __pyx_v_endian_detector;
+  int __pyx_v_little_endian;
+  int __pyx_v_t;
+  char *__pyx_v_f;
+  PyArray_Descr *__pyx_v_descr = 0;
+  int __pyx_v_offset;
+  int __pyx_v_hasfields;
+  int __pyx_r;
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  int __pyx_t_3;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  int __pyx_t_7;
+  PyObject *__pyx_t_8 = NULL;
+  char *__pyx_t_9;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("__getbuffer__", 0);
+  if (__pyx_v_info != NULL) {
+    __pyx_v_info->obj = Py_None; __Pyx_INCREF(Py_None);
+    __Pyx_GIVEREF(__pyx_v_info->obj);
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":200
+ *             # of flags
+ * 
+ *             if info == NULL: return             # <<<<<<<<<<<<<<
+ * 
+ *             cdef int copy_shape, i, ndim
+ */
+  __pyx_t_1 = ((__pyx_v_info == NULL) != 0);
+  if (__pyx_t_1) {
+    __pyx_r = 0;
+    goto __pyx_L0;
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":203
+ * 
+ *             cdef int copy_shape, i, ndim
+ *             cdef int endian_detector = 1             # <<<<<<<<<<<<<<
+ *             cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)
+ * 
+ */
+  __pyx_v_endian_detector = 1;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":204
+ *             cdef int copy_shape, i, ndim
+ *             cdef int endian_detector = 1
+ *             cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)             # <<<<<<<<<<<<<<
+ * 
+ *             ndim = PyArray_NDIM(self)
+ */
+  __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":206
+ *             cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)
+ * 
+ *             ndim = PyArray_NDIM(self)             # <<<<<<<<<<<<<<
+ * 
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ */
+  __pyx_v_ndim = PyArray_NDIM(__pyx_v_self);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":208
+ *             ndim = PyArray_NDIM(self)
+ * 
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):             # <<<<<<<<<<<<<<
+ *                 copy_shape = 1
+ *             else:
+ */
+  __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":209
+ * 
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ *                 copy_shape = 1             # <<<<<<<<<<<<<<
+ *             else:
+ *                 copy_shape = 0
+ */
+    __pyx_v_copy_shape = 1;
+    goto __pyx_L4;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":211
+ *                 copy_shape = 1
+ *             else:
+ *                 copy_shape = 0             # <<<<<<<<<<<<<<
+ * 
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ */
+    __pyx_v_copy_shape = 0;
+  }
+  __pyx_L4:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":213
+ *                 copy_shape = 0
+ * 
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)             # <<<<<<<<<<<<<<
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not C contiguous")
+ */
+  __pyx_t_1 = (((__pyx_v_flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":214
+ * 
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):             # <<<<<<<<<<<<<<
+ *                 raise ValueError(u"ndarray is not C contiguous")
+ * 
+ */
+    __pyx_t_2 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_C_CONTIGUOUS) != 0)) != 0);
+    __pyx_t_3 = __pyx_t_2;
+  } else {
+    __pyx_t_3 = __pyx_t_1;
+  }
+  if (__pyx_t_3) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not C contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ */
+    __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__5, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":217
+ *                 raise ValueError(u"ndarray is not C contiguous")
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)             # <<<<<<<<<<<<<<
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")
+ */
+  __pyx_t_3 = (((__pyx_v_flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) != 0);
+  if (__pyx_t_3) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":218
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):             # <<<<<<<<<<<<<<
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")
+ * 
+ */
+    __pyx_t_1 = ((!(PyArray_CHKFLAGS(__pyx_v_self, NPY_F_CONTIGUOUS) != 0)) != 0);
+    __pyx_t_2 = __pyx_t_1;
+  } else {
+    __pyx_t_2 = __pyx_t_3;
+  }
+  if (__pyx_t_2) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":219
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             info.buf = PyArray_DATA(self)
+ */
+    __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__6, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_4);
+    __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    {__pyx_filename = __pyx_f[1]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":221
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")
+ * 
+ *             info.buf = PyArray_DATA(self)             # <<<<<<<<<<<<<<
+ *             info.ndim = ndim
+ *             if copy_shape:
+ */
+  __pyx_v_info->buf = PyArray_DATA(__pyx_v_self);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":222
+ * 
+ *             info.buf = PyArray_DATA(self)
+ *             info.ndim = ndim             # <<<<<<<<<<<<<<
+ *             if copy_shape:
+ *                 # Allocate new buffer for strides and shape info.
+ */
+  __pyx_v_info->ndim = __pyx_v_ndim;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":223
+ *             info.buf = PyArray_DATA(self)
+ *             info.ndim = ndim
+ *             if copy_shape:             # <<<<<<<<<<<<<<
+ *                 # Allocate new buffer for strides and shape info.
+ *                 # This is allocated as one block, strides first.
+ */
+  __pyx_t_2 = (__pyx_v_copy_shape != 0);
+  if (__pyx_t_2) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":226
+ *                 # Allocate new buffer for strides and shape info.
+ *                 # This is allocated as one block, strides first.
+ *                 info.strides = <Py_ssize_t*>stdlib.malloc(sizeof(Py_ssize_t) * <size_t>ndim * 2)             # <<<<<<<<<<<<<<
+ *                 info.shape = info.strides + ndim
+ *                 for i in range(ndim):
+ */
+    __pyx_v_info->strides = ((Py_ssize_t *)malloc((((sizeof(Py_ssize_t)) * ((size_t)__pyx_v_ndim)) * 2)));
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":227
+ *                 # This is allocated as one block, strides first.
+ *                 info.strides = <Py_ssize_t*>stdlib.malloc(sizeof(Py_ssize_t) * <size_t>ndim * 2)
+ *                 info.shape = info.strides + ndim             # <<<<<<<<<<<<<<
+ *                 for i in range(ndim):
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]
+ */
+    __pyx_v_info->shape = (__pyx_v_info->strides + __pyx_v_ndim);
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":228
+ *                 info.strides = <Py_ssize_t*>stdlib.malloc(sizeof(Py_ssize_t) * <size_t>ndim * 2)
+ *                 info.shape = info.strides + ndim
+ *                 for i in range(ndim):             # <<<<<<<<<<<<<<
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]
+ *                     info.shape[i] = PyArray_DIMS(self)[i]
+ */
+    __pyx_t_5 = __pyx_v_ndim;
+    for (__pyx_t_6 = 0; __pyx_t_6 < __pyx_t_5; __pyx_t_6+=1) {
+      __pyx_v_i = __pyx_t_6;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":229
+ *                 info.shape = info.strides + ndim
+ *                 for i in range(ndim):
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]             # <<<<<<<<<<<<<<
+ *                     info.shape[i] = PyArray_DIMS(self)[i]
+ *             else:
+ */
+      (__pyx_v_info->strides[__pyx_v_i]) = (PyArray_STRIDES(__pyx_v_self)[__pyx_v_i]);
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":230
+ *                 for i in range(ndim):
+ *                     info.strides[i] = PyArray_STRIDES(self)[i]
+ *                     info.shape[i] = PyArray_DIMS(self)[i]             # <<<<<<<<<<<<<<
+ *             else:
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)
+ */
+      (__pyx_v_info->shape[__pyx_v_i]) = (PyArray_DIMS(__pyx_v_self)[__pyx_v_i]);
+    }
+    goto __pyx_L7;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":232
+ *                     info.shape[i] = PyArray_DIMS(self)[i]
+ *             else:
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)             # <<<<<<<<<<<<<<
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)
+ *             info.suboffsets = NULL
+ */
+    __pyx_v_info->strides = ((Py_ssize_t *)PyArray_STRIDES(__pyx_v_self));
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":233
+ *             else:
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)             # <<<<<<<<<<<<<<
+ *             info.suboffsets = NULL
+ *             info.itemsize = PyArray_ITEMSIZE(self)
+ */
+    __pyx_v_info->shape = ((Py_ssize_t *)PyArray_DIMS(__pyx_v_self));
+  }
+  __pyx_L7:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":234
+ *                 info.strides = <Py_ssize_t*>PyArray_STRIDES(self)
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)
+ *             info.suboffsets = NULL             # <<<<<<<<<<<<<<
+ *             info.itemsize = PyArray_ITEMSIZE(self)
+ *             info.readonly = not PyArray_ISWRITEABLE(self)
+ */
+  __pyx_v_info->suboffsets = NULL;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":235
+ *                 info.shape = <Py_ssize_t*>PyArray_DIMS(self)
+ *             info.suboffsets = NULL
+ *             info.itemsize = PyArray_ITEMSIZE(self)             # <<<<<<<<<<<<<<
+ *             info.readonly = not PyArray_ISWRITEABLE(self)
+ * 
+ */
+  __pyx_v_info->itemsize = PyArray_ITEMSIZE(__pyx_v_self);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":236
+ *             info.suboffsets = NULL
+ *             info.itemsize = PyArray_ITEMSIZE(self)
+ *             info.readonly = not PyArray_ISWRITEABLE(self)             # <<<<<<<<<<<<<<
+ * 
+ *             cdef int t
+ */
+  __pyx_v_info->readonly = (!(PyArray_ISWRITEABLE(__pyx_v_self) != 0));
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":239
+ * 
+ *             cdef int t
+ *             cdef char* f = NULL             # <<<<<<<<<<<<<<
+ *             cdef dtype descr = self.descr
+ *             cdef list stack
+ */
+  __pyx_v_f = NULL;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":240
+ *             cdef int t
+ *             cdef char* f = NULL
+ *             cdef dtype descr = self.descr             # <<<<<<<<<<<<<<
+ *             cdef list stack
+ *             cdef int offset
+ */
+  __pyx_t_4 = ((PyObject *)__pyx_v_self->descr);
+  __Pyx_INCREF(__pyx_t_4);
+  __pyx_v_descr = ((PyArray_Descr *)__pyx_t_4);
+  __pyx_t_4 = 0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":244
+ *             cdef int offset
+ * 
+ *             cdef bint hasfields = PyDataType_HASFIELDS(descr)             # <<<<<<<<<<<<<<
+ * 
+ *             if not hasfields and not copy_shape:
+ */
+  __pyx_v_hasfields = PyDataType_HASFIELDS(__pyx_v_descr);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":246
+ *             cdef bint hasfields = PyDataType_HASFIELDS(descr)
+ * 
+ *             if not hasfields and not copy_shape:             # <<<<<<<<<<<<<<
+ *                 # do not call releasebuffer
+ *                 info.obj = None
+ */
+  __pyx_t_2 = ((!(__pyx_v_hasfields != 0)) != 0);
+  if (__pyx_t_2) {
+    __pyx_t_3 = ((!(__pyx_v_copy_shape != 0)) != 0);
+    __pyx_t_1 = __pyx_t_3;
+  } else {
+    __pyx_t_1 = __pyx_t_2;
+  }
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":248
+ *             if not hasfields and not copy_shape:
+ *                 # do not call releasebuffer
+ *                 info.obj = None             # <<<<<<<<<<<<<<
+ *             else:
+ *                 # need to call releasebuffer
+ */
+    __Pyx_INCREF(Py_None);
+    __Pyx_GIVEREF(Py_None);
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj);
+    __pyx_v_info->obj = Py_None;
+    goto __pyx_L10;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":251
+ *             else:
+ *                 # need to call releasebuffer
+ *                 info.obj = self             # <<<<<<<<<<<<<<
+ * 
+ *             if not hasfields:
+ */
+    __Pyx_INCREF(((PyObject *)__pyx_v_self));
+    __Pyx_GIVEREF(((PyObject *)__pyx_v_self));
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj);
+    __pyx_v_info->obj = ((PyObject *)__pyx_v_self);
+  }
+  __pyx_L10:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":253
+ *                 info.obj = self
+ * 
+ *             if not hasfields:             # <<<<<<<<<<<<<<
+ *                 t = descr.type_num
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ */
+  __pyx_t_1 = ((!(__pyx_v_hasfields != 0)) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":254
+ * 
+ *             if not hasfields:
+ *                 t = descr.type_num             # <<<<<<<<<<<<<<
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ */
+    __pyx_t_5 = __pyx_v_descr->type_num;
+    __pyx_v_t = __pyx_t_5;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":255
+ *             if not hasfields:
+ *                 t = descr.type_num
+ *                 if ((descr.byteorder == c'>' and little_endian) or             # <<<<<<<<<<<<<<
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")
+ */
+    __pyx_t_1 = ((__pyx_v_descr->byteorder == '>') != 0);
+    if (__pyx_t_1) {
+      __pyx_t_2 = (__pyx_v_little_endian != 0);
+    } else {
+      __pyx_t_2 = __pyx_t_1;
+    }
+    if (!__pyx_t_2) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":256
+ *                 t = descr.type_num
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):             # <<<<<<<<<<<<<<
+ *                     raise ValueError(u"Non-native byte order not supported")
+ *                 if   t == NPY_BYTE:        f = "b"
+ */
+      __pyx_t_1 = ((__pyx_v_descr->byteorder == '<') != 0);
+      if (__pyx_t_1) {
+        __pyx_t_3 = ((!(__pyx_v_little_endian != 0)) != 0);
+        __pyx_t_7 = __pyx_t_3;
+      } else {
+        __pyx_t_7 = __pyx_t_1;
+      }
+      __pyx_t_1 = __pyx_t_7;
+    } else {
+      __pyx_t_1 = __pyx_t_2;
+    }
+    if (__pyx_t_1) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"
+ */
+      __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__7, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 257; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 257; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ *                 elif t == NPY_OBJECT:      f = "O"             # <<<<<<<<<<<<<<
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+    switch (__pyx_v_t) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":258
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")
+ *                 if   t == NPY_BYTE:        f = "b"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_UBYTE:       f = "B"
+ *                 elif t == NPY_SHORT:       f = "h"
+ */
+      case NPY_BYTE:
+      __pyx_v_f = __pyx_k_b;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":259
+ *                     raise ValueError(u"Non-native byte order not supported")
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_SHORT:       f = "h"
+ *                 elif t == NPY_USHORT:      f = "H"
+ */
+      case NPY_UBYTE:
+      __pyx_v_f = __pyx_k_B;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":260
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"
+ *                 elif t == NPY_SHORT:       f = "h"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_USHORT:      f = "H"
+ *                 elif t == NPY_INT:         f = "i"
+ */
+      case NPY_SHORT:
+      __pyx_v_f = __pyx_k_h;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":261
+ *                 elif t == NPY_UBYTE:       f = "B"
+ *                 elif t == NPY_SHORT:       f = "h"
+ *                 elif t == NPY_USHORT:      f = "H"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_INT:         f = "i"
+ *                 elif t == NPY_UINT:        f = "I"
+ */
+      case NPY_USHORT:
+      __pyx_v_f = __pyx_k_H;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":262
+ *                 elif t == NPY_SHORT:       f = "h"
+ *                 elif t == NPY_USHORT:      f = "H"
+ *                 elif t == NPY_INT:         f = "i"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_UINT:        f = "I"
+ *                 elif t == NPY_LONG:        f = "l"
+ */
+      case NPY_INT:
+      __pyx_v_f = __pyx_k_i;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":263
+ *                 elif t == NPY_USHORT:      f = "H"
+ *                 elif t == NPY_INT:         f = "i"
+ *                 elif t == NPY_UINT:        f = "I"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_LONG:        f = "l"
+ *                 elif t == NPY_ULONG:       f = "L"
+ */
+      case NPY_UINT:
+      __pyx_v_f = __pyx_k_I;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":264
+ *                 elif t == NPY_INT:         f = "i"
+ *                 elif t == NPY_UINT:        f = "I"
+ *                 elif t == NPY_LONG:        f = "l"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_ULONG:       f = "L"
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ */
+      case NPY_LONG:
+      __pyx_v_f = __pyx_k_l;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":265
+ *                 elif t == NPY_UINT:        f = "I"
+ *                 elif t == NPY_LONG:        f = "l"
+ *                 elif t == NPY_ULONG:       f = "L"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ */
+      case NPY_ULONG:
+      __pyx_v_f = __pyx_k_L;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":266
+ *                 elif t == NPY_LONG:        f = "l"
+ *                 elif t == NPY_ULONG:       f = "L"
+ *                 elif t == NPY_LONGLONG:    f = "q"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ *                 elif t == NPY_FLOAT:       f = "f"
+ */
+      case NPY_LONGLONG:
+      __pyx_v_f = __pyx_k_q;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":267
+ *                 elif t == NPY_ULONG:       f = "L"
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ *                 elif t == NPY_ULONGLONG:   f = "Q"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_FLOAT:       f = "f"
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ */
+      case NPY_ULONGLONG:
+      __pyx_v_f = __pyx_k_Q;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":268
+ *                 elif t == NPY_LONGLONG:    f = "q"
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ *                 elif t == NPY_FLOAT:       f = "f"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ */
+      case NPY_FLOAT:
+      __pyx_v_f = __pyx_k_f;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":269
+ *                 elif t == NPY_ULONGLONG:   f = "Q"
+ *                 elif t == NPY_FLOAT:       f = "f"
+ *                 elif t == NPY_DOUBLE:      f = "d"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ */
+      case NPY_DOUBLE:
+      __pyx_v_f = __pyx_k_d;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":270
+ *                 elif t == NPY_FLOAT:       f = "f"
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ */
+      case NPY_LONGDOUBLE:
+      __pyx_v_f = __pyx_k_g;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":271
+ *                 elif t == NPY_DOUBLE:      f = "d"
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ *                 elif t == NPY_CFLOAT:      f = "Zf"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ */
+      case NPY_CFLOAT:
+      __pyx_v_f = __pyx_k_Zf;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":272
+ *                 elif t == NPY_LONGDOUBLE:  f = "g"
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ *                 elif t == NPY_OBJECT:      f = "O"
+ */
+      case NPY_CDOUBLE:
+      __pyx_v_f = __pyx_k_Zd;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":273
+ *                 elif t == NPY_CFLOAT:      f = "Zf"
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"             # <<<<<<<<<<<<<<
+ *                 elif t == NPY_OBJECT:      f = "O"
+ *                 else:
+ */
+      case NPY_CLONGDOUBLE:
+      __pyx_v_f = __pyx_k_Zg;
+      break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":274
+ *                 elif t == NPY_CDOUBLE:     f = "Zd"
+ *                 elif t == NPY_CLONGDOUBLE: f = "Zg"
+ *                 elif t == NPY_OBJECT:      f = "O"             # <<<<<<<<<<<<<<
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+      case NPY_OBJECT:
+      __pyx_v_f = __pyx_k_O;
+      break;
+      default:
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":276
+ *                 elif t == NPY_OBJECT:      f = "O"
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)             # <<<<<<<<<<<<<<
+ *                 info.format = f
+ *                 return
+ */
+      __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_t); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_8 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_t_4); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_8);
+      __Pyx_GIVEREF(__pyx_t_8);
+      __pyx_t_8 = 0;
+      __pyx_t_8 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_4, NULL); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_8);
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __Pyx_Raise(__pyx_t_8, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 276; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      break;
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":277
+ *                 else:
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ *                 info.format = f             # <<<<<<<<<<<<<<
+ *                 return
+ *             else:
+ */
+    __pyx_v_info->format = __pyx_v_f;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":278
+ *                     raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ *                 info.format = f
+ *                 return             # <<<<<<<<<<<<<<
+ *             else:
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)
+ */
+    __pyx_r = 0;
+    goto __pyx_L0;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":280
+ *                 return
+ *             else:
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)             # <<<<<<<<<<<<<<
+ *                 info.format[0] = c'^' # Native data types, manual alignment
+ *                 offset = 0
+ */
+    __pyx_v_info->format = ((char *)malloc(255));
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":281
+ *             else:
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)
+ *                 info.format[0] = c'^' # Native data types, manual alignment             # <<<<<<<<<<<<<<
+ *                 offset = 0
+ *                 f = _util_dtypestring(descr, info.format + 1,
+ */
+    (__pyx_v_info->format[0]) = '^';
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":282
+ *                 info.format = <char*>stdlib.malloc(_buffer_format_string_len)
+ *                 info.format[0] = c'^' # Native data types, manual alignment
+ *                 offset = 0             # <<<<<<<<<<<<<<
+ *                 f = _util_dtypestring(descr, info.format + 1,
+ *                                       info.format + _buffer_format_string_len,
+ */
+    __pyx_v_offset = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":283
+ *                 info.format[0] = c'^' # Native data types, manual alignment
+ *                 offset = 0
+ *                 f = _util_dtypestring(descr, info.format + 1,             # <<<<<<<<<<<<<<
+ *                                       info.format + _buffer_format_string_len,
+ *                                       &offset)
+ */
+    __pyx_t_9 = __pyx_f_5numpy__util_dtypestring(__pyx_v_descr, (__pyx_v_info->format + 1), (__pyx_v_info->format + 255), (&__pyx_v_offset)); if (unlikely(__pyx_t_9 == NULL)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 283; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __pyx_v_f = __pyx_t_9;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":286
+ *                                       info.format + _buffer_format_string_len,
+ *                                       &offset)
+ *                 f[0] = c'\0' # Terminate format string             # <<<<<<<<<<<<<<
+ * 
+ *         def __releasebuffer__(ndarray self, Py_buffer* info):
+ */
+    (__pyx_v_f[0]) = '\x00';
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":194
+ *         # experimental exception made for __getbuffer__ and __releasebuffer__
+ *         # -- the details of this may change.
+ *         def __getbuffer__(ndarray self, Py_buffer* info, int flags):             # <<<<<<<<<<<<<<
+ *             # This implementation of getbuffer is geared towards Cython
+ *             # requirements, and does not yet fullfill the PEP.
+ */
+
+  /* function exit code */
+  __pyx_r = 0;
+  goto __pyx_L0;
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_XDECREF(__pyx_t_8);
+  __Pyx_AddTraceback("numpy.ndarray.__getbuffer__", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = -1;
+  if (__pyx_v_info != NULL && __pyx_v_info->obj != NULL) {
+    __Pyx_GOTREF(__pyx_v_info->obj);
+    __Pyx_DECREF(__pyx_v_info->obj); __pyx_v_info->obj = NULL;
+  }
+  goto __pyx_L2;
+  __pyx_L0:;
+  if (__pyx_v_info != NULL && __pyx_v_info->obj == Py_None) {
+    __Pyx_GOTREF(Py_None);
+    __Pyx_DECREF(Py_None); __pyx_v_info->obj = NULL;
+  }
+  __pyx_L2:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_descr);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288
+ *                 f[0] = c'\0' # Terminate format string
+ * 
+ *         def __releasebuffer__(ndarray self, Py_buffer* info):             # <<<<<<<<<<<<<<
+ *             if PyArray_HASFIELDS(self):
+ *                 stdlib.free(info.format)
+ */
+
+/* Python wrapper for ndarray.__releasebuffer__: casts self to PyArrayObject* and forwards self and info to __pyx_pf_5numpy_7ndarray_2__releasebuffer__; returns nothing. */
+static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info); /*proto*/
+static CYTHON_UNUSED void __pyx_pw_5numpy_7ndarray_3__releasebuffer__(PyObject *__pyx_v_self, Py_buffer *__pyx_v_info) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__releasebuffer__ (wrapper)", 0);
+  __pyx_pf_5numpy_7ndarray_2__releasebuffer__(((PyArrayObject *)__pyx_v_self), ((Py_buffer *)__pyx_v_info));
+  /* The implementation is declared void, so no result is checked or propagated here. */
+  /* function exit code: closes the RefNanny context opened at the top of this wrapper */
+  __Pyx_RefNannyFinishContext();
+}
+
+static void __pyx_pf_5numpy_7ndarray_2__releasebuffer__(PyArrayObject *__pyx_v_self, Py_buffer *__pyx_v_info) {
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  __Pyx_RefNannySetupContext("__releasebuffer__", 0);
+  /* Frees the heap blocks that __getbuffer__ may have attached to *info (format string, strides/shape block); the two guards below mirror the conditions under which __getbuffer__ calls malloc. */
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":289
+ * 
+ *         def __releasebuffer__(ndarray self, Py_buffer* info):
+ *             if PyArray_HASFIELDS(self):             # <<<<<<<<<<<<<<
+ *                 stdlib.free(info.format)
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ */
+  __pyx_t_1 = (PyArray_HASFIELDS(__pyx_v_self) != 0);
+  if (__pyx_t_1) {
+    /* Arrays with fields: __getbuffer__ malloc'ed info->format for them; the no-fields path assigns one of the __pyx_k_* format constants instead, which must not be freed. */
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":290
+ *         def __releasebuffer__(ndarray self, Py_buffer* info):
+ *             if PyArray_HASFIELDS(self):
+ *                 stdlib.free(info.format)             # <<<<<<<<<<<<<<
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ *                 stdlib.free(info.strides)
+ */
+    free(__pyx_v_info->format);
+    goto __pyx_L3;
+  }
+  __pyx_L3:;
+  /* __pyx_L3 is only the join point of the if above; the goto lands on the statement it would have fallen through to anyway. */
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":291
+ *             if PyArray_HASFIELDS(self):
+ *                 stdlib.free(info.format)
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):             # <<<<<<<<<<<<<<
+ *                 stdlib.free(info.strides)
+ *                 # info.shape was stored after info.strides in the same block
+ */
+  __pyx_t_1 = (((sizeof(npy_intp)) != (sizeof(Py_ssize_t))) != 0);
+  if (__pyx_t_1) {
+    /* Same sizeof test that sets copy_shape in __getbuffer__: only then were strides and shape copied into one malloc'ed block (shape = strides + ndim), so this single free releases both. */
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":292
+ *                 stdlib.free(info.format)
+ *             if sizeof(npy_intp) != sizeof(Py_ssize_t):
+ *                 stdlib.free(info.strides)             # <<<<<<<<<<<<<<
+ *                 # info.shape was stored after info.strides in the same block
+ * 
+ */
+    free(__pyx_v_info->strides);
+    goto __pyx_L4;
+  }
+  __pyx_L4:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":288
+ *                 f[0] = c'\0' # Terminate format string
+ * 
+ *         def __releasebuffer__(ndarray self, Py_buffer* info):             # <<<<<<<<<<<<<<
+ *             if PyArray_HASFIELDS(self):
+ *                 stdlib.free(info.format)
+ */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":768
+ * ctypedef npy_cdouble     complex_t
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew1(PyObject *__pyx_v_a) { /* C form of the .pxd inline helper PyArray_MultiIterNew1(a): returns the result of PyArray_MultiIterNew(1, a), or NULL after adding a traceback entry when that call returns NULL. */
+  PyObject *__pyx_r = NULL; /* return value */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary holding the call result until it is handed to __pyx_r */
+  int __pyx_lineno = 0; /* .pxd line reported in the traceback on failure */
+  const char *__pyx_filename = NULL; /* file name reported in the traceback on failure */
+  int __pyx_clineno = 0; /* line of this generated C file reported on failure */
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew1", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":769
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):
+ *     return PyArray_MultiIterNew(1, <void*>a)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL at this point (initialised above, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(1, ((void *)__pyx_v_a)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 769; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* NULL result: record the source position and take the error exit */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* hand the result to the return slot ... */
+  __pyx_t_1 = 0; /* ... and clear the temporary */
+  goto __pyx_L0; /* success: skip the error handling below */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":768
+ * ctypedef npy_cdouble     complex_t
+ * 
+ * cdef inline object PyArray_MultiIterNew1(a):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit, reached only via the goto above */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew1", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* a NULL return signals the failure to the caller */
+  __pyx_L0:; /* common exit for the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew2(PyObject *__pyx_v_a, PyObject *__pyx_v_b) { /* C form of the .pxd inline helper PyArray_MultiIterNew2(a, b): returns the result of PyArray_MultiIterNew(2, a, b), or NULL after adding a traceback entry when that call returns NULL. */
+  PyObject *__pyx_r = NULL; /* return value */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary holding the call result until it is handed to __pyx_r */
+  int __pyx_lineno = 0; /* .pxd line reported in the traceback on failure */
+  const char *__pyx_filename = NULL; /* file name reported in the traceback on failure */
+  int __pyx_clineno = 0; /* line of this generated C file reported on failure */
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew2", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":772
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL at this point (initialised above, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(2, ((void *)__pyx_v_a), ((void *)__pyx_v_b)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 772; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* NULL result: record the source position and take the error exit */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* hand the result to the return slot ... */
+  __pyx_t_1 = 0; /* ... and clear the temporary */
+  goto __pyx_L0; /* success: skip the error handling below */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":771
+ *     return PyArray_MultiIterNew(1, <void*>a)
+ * 
+ * cdef inline object PyArray_MultiIterNew2(a, b):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit, reached only via the goto above */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew2", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* a NULL return signals the failure to the caller */
+  __pyx_L0:; /* common exit for the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":774
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew3(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c) { /* C form of the .pxd inline helper PyArray_MultiIterNew3(a, b, c): returns the result of PyArray_MultiIterNew(3, a, b, c), or NULL after adding a traceback entry when that call returns NULL. */
+  PyObject *__pyx_r = NULL; /* return value */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary holding the call result until it is handed to __pyx_r */
+  int __pyx_lineno = 0; /* .pxd line reported in the traceback on failure */
+  const char *__pyx_filename = NULL; /* file name reported in the traceback on failure */
+  int __pyx_clineno = 0; /* line of this generated C file reported on failure */
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew3", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":775
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL at this point (initialised above, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(3, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 775; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* NULL result: record the source position and take the error exit */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* hand the result to the return slot ... */
+  __pyx_t_1 = 0; /* ... and clear the temporary */
+  goto __pyx_L0; /* success: skip the error handling below */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":774
+ *     return PyArray_MultiIterNew(2, <void*>a, <void*>b)
+ * 
+ * cdef inline object PyArray_MultiIterNew3(a, b, c):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit, reached only via the goto above */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew3", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* a NULL return signals the failure to the caller */
+  __pyx_L0:; /* common exit for the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":777
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew4(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d) { /* C form of the .pxd inline helper PyArray_MultiIterNew4(a, b, c, d): returns the result of PyArray_MultiIterNew(4, a, b, c, d), or NULL after adding a traceback entry when that call returns NULL. */
+  PyObject *__pyx_r = NULL; /* return value */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary holding the call result until it is handed to __pyx_r */
+  int __pyx_lineno = 0; /* .pxd line reported in the traceback on failure */
+  const char *__pyx_filename = NULL; /* file name reported in the traceback on failure */
+  int __pyx_clineno = 0; /* line of this generated C file reported on failure */
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew4", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":778
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL at this point (initialised above, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(4, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 778; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* NULL result: record the source position and take the error exit */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* hand the result to the return slot ... */
+  __pyx_t_1 = 0; /* ... and clear the temporary */
+  goto __pyx_L0; /* success: skip the error handling below */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":777
+ *     return PyArray_MultiIterNew(3, <void*>a, <void*>b, <void*> c)
+ * 
+ * cdef inline object PyArray_MultiIterNew4(a, b, c, d):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit, reached only via the goto above */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew4", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* a NULL return signals the failure to the caller */
+  __pyx_L0:; /* common exit for the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":780
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_PyArray_MultiIterNew5(PyObject *__pyx_v_a, PyObject *__pyx_v_b, PyObject *__pyx_v_c, PyObject *__pyx_v_d, PyObject *__pyx_v_e) { /* C form of the .pxd inline helper PyArray_MultiIterNew5(a, b, c, d, e): returns the result of PyArray_MultiIterNew(5, a, b, c, d, e), or NULL after adding a traceback entry when that call returns NULL. */
+  PyObject *__pyx_r = NULL; /* return value */
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL; /* temporary holding the call result until it is handed to __pyx_r */
+  int __pyx_lineno = 0; /* .pxd line reported in the traceback on failure */
+  const char *__pyx_filename = NULL; /* file name reported in the traceback on failure */
+  int __pyx_clineno = 0; /* line of this generated C file reported on failure */
+  __Pyx_RefNannySetupContext("PyArray_MultiIterNew5", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":781
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL:
+ */
+  __Pyx_XDECREF(__pyx_r); /* __pyx_r is still NULL at this point (initialised above, not yet assigned) */
+  __pyx_t_1 = PyArray_MultiIterNew(5, ((void *)__pyx_v_a), ((void *)__pyx_v_b), ((void *)__pyx_v_c), ((void *)__pyx_v_d), ((void *)__pyx_v_e)); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 781; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* NULL result: record the source position and take the error exit */
+  __Pyx_GOTREF(__pyx_t_1);
+  __pyx_r = __pyx_t_1; /* hand the result to the return slot ... */
+  __pyx_t_1 = 0; /* ... and clear the temporary */
+  goto __pyx_L0; /* success: skip the error handling below */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":780
+ *     return PyArray_MultiIterNew(4, <void*>a, <void*>b, <void*>c, <void*> d)
+ * 
+ * cdef inline object PyArray_MultiIterNew5(a, b, c, d, e):             # <<<<<<<<<<<<<<
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ */
+
+  /* function exit code */
+  __pyx_L1_error:; /* error exit, reached only via the goto above */
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_AddTraceback("numpy.PyArray_MultiIterNew5", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = 0; /* a NULL return signals the failure to the caller */
+  __pyx_L0:; /* common exit for the success and error paths */
+  __Pyx_XGIVEREF(__pyx_r);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":783
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL:             # <<<<<<<<<<<<<<
+ *     # Recursive utility function used in __getbuffer__ to get format
+ *     # string. The new location in the format string is returned.
+ */
+
+static CYTHON_INLINE char *__pyx_f_5numpy__util_dtypestring(PyArray_Descr *__pyx_v_descr, char *__pyx_v_f, char *__pyx_v_end, int *__pyx_v_offset) {
+  PyArray_Descr *__pyx_v_child = 0;
+  int __pyx_v_endian_detector;
+  int __pyx_v_little_endian;
+  PyObject *__pyx_v_fields = 0;
+  PyObject *__pyx_v_childname = NULL;
+  PyObject *__pyx_v_new_offset = NULL;
+  PyObject *__pyx_v_t = NULL;
+  char *__pyx_r;
+  __Pyx_RefNannyDeclarations
+  PyObject *__pyx_t_1 = NULL;
+  Py_ssize_t __pyx_t_2;
+  PyObject *__pyx_t_3 = NULL;
+  PyObject *__pyx_t_4 = NULL;
+  int __pyx_t_5;
+  int __pyx_t_6;
+  int __pyx_t_7;
+  int __pyx_t_8;
+  int __pyx_t_9;
+  long __pyx_t_10;
+  char *__pyx_t_11;
+  int __pyx_lineno = 0;
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannySetupContext("_util_dtypestring", 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":790
+ *     cdef int delta_offset
+ *     cdef tuple i
+ *     cdef int endian_detector = 1             # <<<<<<<<<<<<<<
+ *     cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)
+ *     cdef tuple fields
+ */
+  __pyx_v_endian_detector = 1;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":791
+ *     cdef tuple i
+ *     cdef int endian_detector = 1
+ *     cdef bint little_endian = ((<char*>&endian_detector)[0] != 0)             # <<<<<<<<<<<<<<
+ *     cdef tuple fields
+ * 
+ */
+  __pyx_v_little_endian = ((((char *)(&__pyx_v_endian_detector))[0]) != 0);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":794
+ *     cdef tuple fields
+ * 
+ *     for childname in descr.names:             # <<<<<<<<<<<<<<
+ *         fields = descr.fields[childname]
+ *         child, new_offset = fields
+ */
+  if (unlikely(__pyx_v_descr->names == Py_None)) {
+    PyErr_SetString(PyExc_TypeError, "'NoneType' object is not iterable");
+    {__pyx_filename = __pyx_f[1]; __pyx_lineno = 794; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  }
+  __pyx_t_1 = __pyx_v_descr->names; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0;
+  for (;;) {
+    if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break;
+    #if CYTHON_COMPILING_IN_CPYTHON
+    __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_3); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 794; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    #else
+    __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 794; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    #endif
+    __Pyx_XDECREF_SET(__pyx_v_childname, __pyx_t_3);
+    __pyx_t_3 = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":795
+ * 
+ *     for childname in descr.names:
+ *         fields = descr.fields[childname]             # <<<<<<<<<<<<<<
+ *         child, new_offset = fields
+ * 
+ */
+    __pyx_t_3 = PyObject_GetItem(__pyx_v_descr->fields, __pyx_v_childname); if (unlikely(__pyx_t_3 == NULL)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 795; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+    __Pyx_GOTREF(__pyx_t_3);
+    if (!(likely(PyTuple_CheckExact(__pyx_t_3))||((__pyx_t_3) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "tuple", Py_TYPE(__pyx_t_3)->tp_name), 0))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 795; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_XDECREF_SET(__pyx_v_fields, ((PyObject*)__pyx_t_3));
+    __pyx_t_3 = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":796
+ *     for childname in descr.names:
+ *         fields = descr.fields[childname]
+ *         child, new_offset = fields             # <<<<<<<<<<<<<<
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:
+ */
+    if (likely(__pyx_v_fields != Py_None)) {
+      PyObject* sequence = __pyx_v_fields;
+      #if CYTHON_COMPILING_IN_CPYTHON
+      Py_ssize_t size = Py_SIZE(sequence);
+      #else
+      Py_ssize_t size = PySequence_Size(sequence);
+      #endif
+      if (unlikely(size != 2)) {
+        if (size > 2) __Pyx_RaiseTooManyValuesError(2);
+        else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size);
+        {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      #if CYTHON_COMPILING_IN_CPYTHON
+      __pyx_t_3 = PyTuple_GET_ITEM(sequence, 0); 
+      __pyx_t_4 = PyTuple_GET_ITEM(sequence, 1); 
+      __Pyx_INCREF(__pyx_t_3);
+      __Pyx_INCREF(__pyx_t_4);
+      #else
+      __pyx_t_3 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      #endif
+    } else {
+      __Pyx_RaiseNoneNotIterableError(); {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+    if (!(likely(((__pyx_t_3) == Py_None) || likely(__Pyx_TypeTest(__pyx_t_3, __pyx_ptype_5numpy_dtype))))) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 796; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_XDECREF_SET(__pyx_v_child, ((PyArray_Descr *)__pyx_t_3));
+    __pyx_t_3 = 0;
+    __Pyx_XDECREF_SET(__pyx_v_new_offset, __pyx_t_4);
+    __pyx_t_4 = 0;
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":798
+ *         child, new_offset = fields
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:             # <<<<<<<<<<<<<<
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ * 
+ */
+    __pyx_t_4 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 798; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_4);
+    __pyx_t_3 = PyNumber_Subtract(__pyx_v_new_offset, __pyx_t_4); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 798; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_GOTREF(__pyx_t_3);
+    __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+    __pyx_t_5 = __Pyx_PyInt_As_int(__pyx_t_3); if (unlikely((__pyx_t_5 == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 798; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+    __pyx_t_6 = ((((__pyx_v_end - __pyx_v_f) - ((int)__pyx_t_5)) < 15) != 0);
+    if (__pyx_t_6) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":799
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")             # <<<<<<<<<<<<<<
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or
+ */
+      __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__8, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":801
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or             # <<<<<<<<<<<<<<
+ *             (child.byteorder == c'<' and not little_endian)):
+ *             raise ValueError(u"Non-native byte order not supported")
+ */
+    __pyx_t_6 = ((__pyx_v_child->byteorder == '>') != 0);
+    if (__pyx_t_6) {
+      __pyx_t_7 = (__pyx_v_little_endian != 0);
+    } else {
+      __pyx_t_7 = __pyx_t_6;
+    }
+    if (!__pyx_t_7) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":802
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or
+ *             (child.byteorder == c'<' and not little_endian)):             # <<<<<<<<<<<<<<
+ *             raise ValueError(u"Non-native byte order not supported")
+ *             # One could encode it in the format string and have Cython
+ */
+      __pyx_t_6 = ((__pyx_v_child->byteorder == '<') != 0);
+      if (__pyx_t_6) {
+        __pyx_t_8 = ((!(__pyx_v_little_endian != 0)) != 0);
+        __pyx_t_9 = __pyx_t_8;
+      } else {
+        __pyx_t_9 = __pyx_t_6;
+      }
+      __pyx_t_6 = __pyx_t_9;
+    } else {
+      __pyx_t_6 = __pyx_t_7;
+    }
+    if (__pyx_t_6) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803
+ *         if ((child.byteorder == c'>' and little_endian) or
+ *             (child.byteorder == c'<' and not little_endian)):
+ *             raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *             # One could encode it in the format string and have Cython
+ *             # complain instead, BUT: < and > in format strings also imply
+ */
+      __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_tuple__9, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 803; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      {__pyx_filename = __pyx_f[1]; __pyx_lineno = 803; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":813
+ * 
+ *         # Output padding bytes
+ *         while offset[0] < new_offset:             # <<<<<<<<<<<<<<
+ *             f[0] = 120 # "x"; pad byte
+ *             f += 1
+ */
+    while (1) {
+      __pyx_t_3 = __Pyx_PyInt_From_int((__pyx_v_offset[0])); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 813; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_t_3, __pyx_v_new_offset, Py_LT); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 813; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 813; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (!__pyx_t_6) break;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":814
+ *         # Output padding bytes
+ *         while offset[0] < new_offset:
+ *             f[0] = 120 # "x"; pad byte             # <<<<<<<<<<<<<<
+ *             f += 1
+ *             offset[0] += 1
+ */
+      (__pyx_v_f[0]) = 120;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":815
+ *         while offset[0] < new_offset:
+ *             f[0] = 120 # "x"; pad byte
+ *             f += 1             # <<<<<<<<<<<<<<
+ *             offset[0] += 1
+ * 
+ */
+      __pyx_v_f = (__pyx_v_f + 1);
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":816
+ *             f[0] = 120 # "x"; pad byte
+ *             f += 1
+ *             offset[0] += 1             # <<<<<<<<<<<<<<
+ * 
+ *         offset[0] += child.itemsize
+ */
+      __pyx_t_10 = 0;
+      (__pyx_v_offset[__pyx_t_10]) = ((__pyx_v_offset[__pyx_t_10]) + 1);
+    }
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":818
+ *             offset[0] += 1
+ * 
+ *         offset[0] += child.itemsize             # <<<<<<<<<<<<<<
+ * 
+ *         if not PyDataType_HASFIELDS(child):
+ */
+    __pyx_t_10 = 0;
+    (__pyx_v_offset[__pyx_t_10]) = ((__pyx_v_offset[__pyx_t_10]) + __pyx_v_child->elsize);
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":820
+ *         offset[0] += child.itemsize
+ * 
+ *         if not PyDataType_HASFIELDS(child):             # <<<<<<<<<<<<<<
+ *             t = child.type_num
+ *             if end - f < 5:
+ */
+    __pyx_t_6 = ((!(PyDataType_HASFIELDS(__pyx_v_child) != 0)) != 0);
+    if (__pyx_t_6) {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":821
+ * 
+ *         if not PyDataType_HASFIELDS(child):
+ *             t = child.type_num             # <<<<<<<<<<<<<<
+ *             if end - f < 5:
+ *                 raise RuntimeError(u"Format string allocated too short.")
+ */
+      __pyx_t_4 = __Pyx_PyInt_From_int(__pyx_v_child->type_num); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 821; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __Pyx_XDECREF_SET(__pyx_v_t, __pyx_t_4);
+      __pyx_t_4 = 0;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":822
+ *         if not PyDataType_HASFIELDS(child):
+ *             t = child.type_num
+ *             if end - f < 5:             # <<<<<<<<<<<<<<
+ *                 raise RuntimeError(u"Format string allocated too short.")
+ * 
+ */
+      __pyx_t_6 = (((__pyx_v_end - __pyx_v_f) < 5) != 0);
+      if (__pyx_t_6) {
+
+        /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823
+ *             t = child.type_num
+ *             if end - f < 5:
+ *                 raise RuntimeError(u"Format string allocated too short.")             # <<<<<<<<<<<<<<
+ * 
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ */
+        __pyx_t_4 = __Pyx_PyObject_Call(__pyx_builtin_RuntimeError, __pyx_tuple__10, NULL); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 823; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_4);
+        __Pyx_Raise(__pyx_t_4, 0, 0, 0);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        {__pyx_filename = __pyx_f[1]; __pyx_lineno = 823; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":826
+ * 
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ *             if   t == NPY_BYTE:        f[0] =  98 #"b"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_BYTE); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 826; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 826; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 826; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 98;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":827
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ *             if   t == NPY_BYTE:        f[0] =  98 #"b"
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_UBYTE); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 827; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 827; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 827; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 66;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":828
+ *             if   t == NPY_BYTE:        f[0] =  98 #"b"
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_SHORT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 828; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 828; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 828; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 104;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":829
+ *             elif t == NPY_UBYTE:       f[0] =  66 #"B"
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_USHORT); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 829; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 829; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 829; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 72;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":830
+ *             elif t == NPY_SHORT:       f[0] = 104 #"h"
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ *             elif t == NPY_INT:         f[0] = 105 #"i"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_INT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 830; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 830; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 830; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 105;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":831
+ *             elif t == NPY_USHORT:      f[0] =  72 #"H"
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_UINT); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 831; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 831; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 831; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 73;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":832
+ *             elif t == NPY_INT:         f[0] = 105 #"i"
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_LONG); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 832; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 832; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 832; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 108;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":833
+ *             elif t == NPY_UINT:        f[0] =  73 #"I"
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_ULONG); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 833; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 833; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 833; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 76;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":834
+ *             elif t == NPY_LONG:        f[0] = 108 #"l"
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_LONGLONG); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 834; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 834; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 834; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 113;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":835
+ *             elif t == NPY_ULONG:       f[0] = 76  #"L"
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_ULONGLONG); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 835; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 835; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 835; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 81;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":836
+ *             elif t == NPY_LONGLONG:    f[0] = 113 #"q"
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_FLOAT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 836; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 836; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 836; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 102;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":837
+ *             elif t == NPY_ULONGLONG:   f[0] = 81  #"Q"
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_DOUBLE); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 837; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 837; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 837; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 100;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":838
+ *             elif t == NPY_FLOAT:       f[0] = 102 #"f"
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"             # <<<<<<<<<<<<<<
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_LONGDOUBLE); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 838; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 838; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 838; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 103;
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":839
+ *             elif t == NPY_DOUBLE:      f[0] = 100 #"d"
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf             # <<<<<<<<<<<<<<
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_CFLOAT); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 839; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 839; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 839; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 90;
+        (__pyx_v_f[1]) = 102;
+        __pyx_v_f = (__pyx_v_f + 1);
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":840
+ *             elif t == NPY_LONGDOUBLE:  f[0] = 103 #"g"
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd             # <<<<<<<<<<<<<<
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_CDOUBLE); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 840; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 840; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 840; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 90;
+        (__pyx_v_f[1]) = 100;
+        __pyx_v_f = (__pyx_v_f + 1);
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":841
+ *             elif t == NPY_CFLOAT:      f[0] = 90; f[1] = 102; f += 1 # Zf
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg             # <<<<<<<<<<<<<<
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"
+ *             else:
+ */
+      __pyx_t_3 = PyInt_FromLong(NPY_CLONGDOUBLE); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 841; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_3);
+      __pyx_t_4 = PyObject_RichCompare(__pyx_v_t, __pyx_t_3, Py_EQ); __Pyx_XGOTREF(__pyx_t_4); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 841; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_4); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 841; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 90;
+        (__pyx_v_f[1]) = 103;
+        __pyx_v_f = (__pyx_v_f + 1);
+        goto __pyx_L11;
+      }
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":842
+ *             elif t == NPY_CDOUBLE:     f[0] = 90; f[1] = 100; f += 1 # Zd
+ *             elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"             # <<<<<<<<<<<<<<
+ *             else:
+ *                 raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ */
+      __pyx_t_4 = PyInt_FromLong(NPY_OBJECT); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 842; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_GOTREF(__pyx_t_4);
+      __pyx_t_3 = PyObject_RichCompare(__pyx_v_t, __pyx_t_4, Py_EQ); __Pyx_XGOTREF(__pyx_t_3); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 842; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+      __pyx_t_6 = __Pyx_PyObject_IsTrue(__pyx_t_3); if (unlikely(__pyx_t_6 < 0)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 842; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+      if (__pyx_t_6) {
+        (__pyx_v_f[0]) = 79;
+        goto __pyx_L11;
+      }
+      /*else*/ {
+
+        /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":844
+ *             elif t == NPY_OBJECT:      f[0] = 79 #"O"
+ *             else:
+ *                 raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)             # <<<<<<<<<<<<<<
+ *             f += 1
+ *         else:
+ */
+        __pyx_t_3 = PyUnicode_Format(__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_v_t); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_3);
+        __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_4);
+        PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_3);
+        __Pyx_GIVEREF(__pyx_t_3);
+        __pyx_t_3 = 0;
+        __pyx_t_3 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_4, NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+        __Pyx_GOTREF(__pyx_t_3);
+        __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0;
+        __Pyx_Raise(__pyx_t_3, 0, 0, 0);
+        __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
+        {__pyx_filename = __pyx_f[1]; __pyx_lineno = 844; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      }
+      __pyx_L11:;
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":845
+ *             else:
+ *                 raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t)
+ *             f += 1             # <<<<<<<<<<<<<<
+ *         else:
+ *             # Cython ignores struct boundary information ("T{...}"),
+ */
+      __pyx_v_f = (__pyx_v_f + 1);
+      goto __pyx_L9;
+    }
+    /*else*/ {
+
+      /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":849
+ *             # Cython ignores struct boundary information ("T{...}"),
+ *             # so don't output it
+ *             f = _util_dtypestring(child, f, end, offset)             # <<<<<<<<<<<<<<
+ *     return f
+ * 
+ */
+      __pyx_t_11 = __pyx_f_5numpy__util_dtypestring(__pyx_v_child, __pyx_v_f, __pyx_v_end, __pyx_v_offset); if (unlikely(__pyx_t_11 == NULL)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 849; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+      __pyx_v_f = __pyx_t_11;
+    }
+    __pyx_L9:;
+  }
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":850
+ *             # so don't output it
+ *             f = _util_dtypestring(child, f, end, offset)
+ *     return f             # <<<<<<<<<<<<<<
+ * 
+ * 
+ */
+  __pyx_r = __pyx_v_f;
+  goto __pyx_L0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":783
+ *     return PyArray_MultiIterNew(5, <void*>a, <void*>b, <void*>c, <void*> d, <void*> e)
+ * 
+ * cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL:             # <<<<<<<<<<<<<<
+ *     # Recursive utility function used in __getbuffer__ to get format
+ *     # string. The new location in the format string is returned.
+ */
+
+  /* function exit code */
+  __pyx_L1_error:;
+  __Pyx_XDECREF(__pyx_t_1);
+  __Pyx_XDECREF(__pyx_t_3);
+  __Pyx_XDECREF(__pyx_t_4);
+  __Pyx_AddTraceback("numpy._util_dtypestring", __pyx_clineno, __pyx_lineno, __pyx_filename);
+  __pyx_r = NULL;
+  __pyx_L0:;
+  __Pyx_XDECREF((PyObject *)__pyx_v_child);
+  __Pyx_XDECREF(__pyx_v_fields);
+  __Pyx_XDECREF(__pyx_v_childname);
+  __Pyx_XDECREF(__pyx_v_new_offset);
+  __Pyx_XDECREF(__pyx_v_t);
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":966
+ * 
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base):             # <<<<<<<<<<<<<<
+ *      cdef PyObject* baseptr
+ *      if base is None:
+ */
+
+static CYTHON_INLINE void __pyx_f_5numpy_set_array_base(PyArrayObject *__pyx_v_arr, PyObject *__pyx_v_base) {
+  PyObject *__pyx_v_baseptr;  /* value that ends up in arr->base: NULL, or base with one extra reference taken */
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  int __pyx_t_2;
+  __Pyx_RefNannySetupContext("set_array_base", 0);
+  /* Cython-generated C for numpy.pxd set_array_base(arr, base): replaces arr->base with base, or with NULL when base is None; the previously stored base is decref'd. */
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":968
+ * cdef inline void set_array_base(ndarray arr, object base):
+ *      cdef PyObject* baseptr
+ *      if base is None:             # <<<<<<<<<<<<<<
+ *          baseptr = NULL
+ *      else:
+ */
+  __pyx_t_1 = (__pyx_v_base == Py_None);  /* identity test against the None singleton */
+  __pyx_t_2 = (__pyx_t_1 != 0);
+  if (__pyx_t_2) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":969
+ *      cdef PyObject* baseptr
+ *      if base is None:
+ *          baseptr = NULL             # <<<<<<<<<<<<<<
+ *      else:
+ *          Py_INCREF(base) # important to do this before decref below!
+ */
+    __pyx_v_baseptr = NULL;  /* None is stored as a NULL base pointer, not as a reference to Py_None */
+    goto __pyx_L3;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":971
+ *          baseptr = NULL
+ *      else:
+ *          Py_INCREF(base) # important to do this before decref below!             # <<<<<<<<<<<<<<
+ *          baseptr = <PyObject*>base
+ *      Py_XDECREF(arr.base)
+ */
+    Py_INCREF(__pyx_v_base);  /* take the new reference before the old one is released; presumably this protects the case where base is the object arr->base already holds */
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":972
+ *      else:
+ *          Py_INCREF(base) # important to do this before decref below!
+ *          baseptr = <PyObject*>base             # <<<<<<<<<<<<<<
+ *      Py_XDECREF(arr.base)
+ *      arr.base = baseptr
+ */
+    __pyx_v_baseptr = ((PyObject *)__pyx_v_base);
+  }
+  __pyx_L3:;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":973
+ *          Py_INCREF(base) # important to do this before decref below!
+ *          baseptr = <PyObject*>base
+ *      Py_XDECREF(arr.base)             # <<<<<<<<<<<<<<
+ *      arr.base = baseptr
+ * 
+ */
+  Py_XDECREF(__pyx_v_arr->base);  /* release the previously stored base; Py_XDECREF accepts a NULL pointer */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":974
+ *          baseptr = <PyObject*>base
+ *      Py_XDECREF(arr.base)
+ *      arr.base = baseptr             # <<<<<<<<<<<<<<
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ */
+  __pyx_v_arr->base = __pyx_v_baseptr;  /* arr now holds the reference taken above, or NULL */
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":966
+ * 
+ * 
+ * cdef inline void set_array_base(ndarray arr, object base):             # <<<<<<<<<<<<<<
+ *      cdef PyObject* baseptr
+ *      if base is None:
+ */
+
+  /* function exit code */
+  __Pyx_RefNannyFinishContext();
+}
+
+/* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976
+ *      arr.base = baseptr
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     if arr.base is NULL:
+ *         return None
+ */
+
+static CYTHON_INLINE PyObject *__pyx_f_5numpy_get_array_base(PyArrayObject *__pyx_v_arr) {
+  PyObject *__pyx_r = NULL;  /* return value; both branches below overwrite it before the exit label */
+  __Pyx_RefNannyDeclarations
+  int __pyx_t_1;
+  __Pyx_RefNannySetupContext("get_array_base", 0);
+  /* Cython-generated C for numpy.pxd get_array_base(arr): yields Py_None when arr->base is NULL, otherwise arr->base; the result is INCREF'd before it is returned. */
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":977
+ * 
+ * cdef inline object get_array_base(ndarray arr):
+ *     if arr.base is NULL:             # <<<<<<<<<<<<<<
+ *         return None
+ *     else:
+ */
+  __pyx_t_1 = ((__pyx_v_arr->base == NULL) != 0);
+  if (__pyx_t_1) {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":978
+ * cdef inline object get_array_base(ndarray arr):
+ *     if arr.base is NULL:
+ *         return None             # <<<<<<<<<<<<<<
+ *     else:
+ *         return <object>arr.base
+ */
+    __Pyx_XDECREF(__pyx_r);  /* __pyx_r is still NULL on this path, so this is presumably a no-op emitted by the generator */
+    __Pyx_INCREF(Py_None);
+    __pyx_r = Py_None;
+    goto __pyx_L0;
+  }
+  /*else*/ {
+
+    /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":980
+ *         return None
+ *     else:
+ *         return <object>arr.base             # <<<<<<<<<<<<<<
+ */
+    __Pyx_XDECREF(__pyx_r);  /* same as above: __pyx_r has not been assigned yet on this path */
+    __Pyx_INCREF(((PyObject *)__pyx_v_arr->base));  /* the caller gets its own reference; arr keeps the one it already holds */
+    __pyx_r = ((PyObject *)__pyx_v_arr->base);
+    goto __pyx_L0;
+  }
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976
+ *      arr.base = baseptr
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     if arr.base is NULL:
+ *         return None
+ */
+
+  /* function exit code */
+  __pyx_L0:;
+  __Pyx_XGIVEREF(__pyx_r);  /* presumably refnanny bookkeeping for handing the result to the caller; the macro is defined elsewhere in this file */
+  __Pyx_RefNannyFinishContext();
+  return __pyx_r;
+}
+
+static PyMethodDef __pyx_methods[] = {  /* method table referenced by __pyx_moduledef; it contains only the all-zero entry, so no functions are registered through it */
+  {0, 0, 0, 0}  /* all-zero sentinel entry that ends a PyMethodDef array */
+};
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef __pyx_moduledef = {  /* module definition for "gpu_nms"; compiled only when PY_MAJOR_VERSION >= 3 */
+  #if PY_VERSION_HEX < 0x03020000
+    { PyObject_HEAD_INIT(NULL) NULL, 0, NULL },  /* spelled-out head initializer used in place of PyModuleDef_HEAD_INIT when PY_VERSION_HEX < 0x03020000 */
+  #else
+    PyModuleDef_HEAD_INIT,
+  #endif
+    __Pyx_NAMESTR("gpu_nms"),  /* m_name */
+    0, /* m_doc: no module docstring is supplied */
+    -1, /* m_size: -1; per the CPython PyModuleDef documentation this means the module keeps its state in globals and does not support sub-interpreters */
+    __pyx_methods /* m_methods: the table defined just above, which holds only its sentinel */,
+    NULL, /* m_reload */
+    NULL, /* m_traverse */
+    NULL, /* m_clear */
+    NULL /* m_free */
+};
+#endif
+
+static __Pyx_StringTabEntry __pyx_string_tab[] = {  /* one row per string constant: a PyObject* slot (&__pyx_...), its C string (__pyx_k_...), sizeof() of that string, then four flags whose meaning is set by __Pyx_StringTabEntry elsewhere in this file (presumably encoding, is_unicode, is_str, intern -- TODO confirm) */
+  {&__pyx_kp_u_Format_string_allocated_too_shor, __pyx_k_Format_string_allocated_too_shor, sizeof(__pyx_k_Format_string_allocated_too_shor), 0, 1, 0, 0},
+  {&__pyx_kp_u_Format_string_allocated_too_shor_2, __pyx_k_Format_string_allocated_too_shor_2, sizeof(__pyx_k_Format_string_allocated_too_shor_2), 0, 1, 0, 0},
+  {&__pyx_kp_u_Non_native_byte_order_not_suppor, __pyx_k_Non_native_byte_order_not_suppor, sizeof(__pyx_k_Non_native_byte_order_not_suppor), 0, 1, 0, 0},
+  {&__pyx_n_s_RuntimeError, __pyx_k_RuntimeError, sizeof(__pyx_k_RuntimeError), 0, 0, 1, 1},
+  {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1},
+  {&__pyx_n_s_argsort, __pyx_k_argsort, sizeof(__pyx_k_argsort), 0, 0, 1, 1},
+  {&__pyx_n_s_boxes_dim, __pyx_k_boxes_dim, sizeof(__pyx_k_boxes_dim), 0, 0, 1, 1},
+  {&__pyx_n_s_boxes_num, __pyx_k_boxes_num, sizeof(__pyx_k_boxes_num), 0, 0, 1, 1},
+  {&__pyx_n_s_dets, __pyx_k_dets, sizeof(__pyx_k_dets), 0, 0, 1, 1},
+  {&__pyx_n_s_device_id, __pyx_k_device_id, sizeof(__pyx_k_device_id), 0, 0, 1, 1},
+  {&__pyx_n_s_dtype, __pyx_k_dtype, sizeof(__pyx_k_dtype), 0, 0, 1, 1},
+  {&__pyx_n_s_gpu_nms, __pyx_k_gpu_nms, sizeof(__pyx_k_gpu_nms), 0, 0, 1, 1},
+  {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1},
+  {&__pyx_n_s_int32, __pyx_k_int32, sizeof(__pyx_k_int32), 0, 0, 1, 1},
+  {&__pyx_n_s_keep, __pyx_k_keep, sizeof(__pyx_k_keep), 0, 0, 1, 1},
+  {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1},
+  {&__pyx_kp_u_ndarray_is_not_C_contiguous, __pyx_k_ndarray_is_not_C_contiguous, sizeof(__pyx_k_ndarray_is_not_C_contiguous), 0, 1, 0, 0},
+  {&__pyx_kp_u_ndarray_is_not_Fortran_contiguou, __pyx_k_ndarray_is_not_Fortran_contiguou, sizeof(__pyx_k_ndarray_is_not_Fortran_contiguou), 0, 1, 0, 0},
+  {&__pyx_kp_s_nfs_yoda_xinleic_Inf_Code_Faste, __pyx_k_nfs_yoda_xinleic_Inf_Code_Faste, sizeof(__pyx_k_nfs_yoda_xinleic_Inf_Code_Faste), 0, 0, 1, 0},
+  {&__pyx_n_s_nms_gpu_nms, __pyx_k_nms_gpu_nms, sizeof(__pyx_k_nms_gpu_nms), 0, 0, 1, 1},
+  {&__pyx_n_s_np, __pyx_k_np, sizeof(__pyx_k_np), 0, 0, 1, 1},
+  {&__pyx_n_s_num_out, __pyx_k_num_out, sizeof(__pyx_k_num_out), 0, 0, 1, 1},
+  {&__pyx_n_s_numpy, __pyx_k_numpy, sizeof(__pyx_k_numpy), 0, 0, 1, 1},
+  {&__pyx_n_s_order, __pyx_k_order, sizeof(__pyx_k_order), 0, 0, 1, 1},
+  {&__pyx_n_s_pyx_getbuffer, __pyx_k_pyx_getbuffer, sizeof(__pyx_k_pyx_getbuffer), 0, 0, 1, 1},
+  {&__pyx_n_s_pyx_releasebuffer, __pyx_k_pyx_releasebuffer, sizeof(__pyx_k_pyx_releasebuffer), 0, 0, 1, 1},
+  {&__pyx_n_s_range, __pyx_k_range, sizeof(__pyx_k_range), 0, 0, 1, 1},
+  {&__pyx_n_s_scores, __pyx_k_scores, sizeof(__pyx_k_scores), 0, 0, 1, 1},
+  {&__pyx_n_s_sorted_dets, __pyx_k_sorted_dets, sizeof(__pyx_k_sorted_dets), 0, 0, 1, 1},
+  {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1},
+  {&__pyx_n_s_thresh, __pyx_k_thresh, sizeof(__pyx_k_thresh), 0, 0, 1, 1},
+  {&__pyx_kp_u_unknown_dtype_code_in_numpy_pxd, __pyx_k_unknown_dtype_code_in_numpy_pxd, sizeof(__pyx_k_unknown_dtype_code_in_numpy_pxd), 0, 1, 0, 0},
+  {&__pyx_n_s_zeros, __pyx_k_zeros, sizeof(__pyx_k_zeros), 0, 0, 1, 1},
+  {0, 0, 0, 0, 0, 0, 0}  /* all-zero row; presumably the end-of-table marker for whatever routine walks this array (not visible here) */
+};
+static int __Pyx_InitCachedBuiltins(void) {  /* resolves ValueError, range and RuntimeError through __Pyx_GetBuiltinName and stores them in the __pyx_builtin_* variables; returns 0 on success, -1 as soon as one lookup yields NULL */
+  __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_builtin_range = __Pyx_GetBuiltinName(__pyx_n_s_range); if (!__pyx_builtin_range) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 228; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_builtin_RuntimeError = __Pyx_GetBuiltinName(__pyx_n_s_RuntimeError); if (!__pyx_builtin_RuntimeError) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  return 0;
+  __pyx_L1_error:;  /* reached by goto after a failed lookup; __pyx_filename, __pyx_lineno and __pyx_clineno were already set at the failing site */
+  return -1;
+}
+
+static int __Pyx_InitCachedConstants(void) {
+  __Pyx_RefNannyDeclarations
+  __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0);
+
+  /* "nms/gpu_nms.pyx":24
+ *         keep = np.zeros(boxes_num, dtype=np.int32)
+ *     cdef np.ndarray[np.float32_t, ndim=1] \
+ *         scores = dets[:, 4]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.int_t, ndim=1] \
+ *         order = scores.argsort()[::-1]
+ */
+  __pyx_slice_ = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice_)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice_);
+  __Pyx_GIVEREF(__pyx_slice_);
+  __pyx_tuple__2 = PyTuple_Pack(2, __pyx_slice_, __pyx_int_4); if (unlikely(!__pyx_tuple__2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__2);
+  __Pyx_GIVEREF(__pyx_tuple__2);
+
+  /* "nms/gpu_nms.pyx":26
+ *         scores = dets[:, 4]
+ *     cdef np.ndarray[np.int_t, ndim=1] \
+ *         order = scores.argsort()[::-1]             # <<<<<<<<<<<<<<
+ *     cdef np.ndarray[np.float32_t, ndim=2] \
+ *         sorted_dets = dets[order, :]
+ */
+  __pyx_slice__3 = PySlice_New(Py_None, Py_None, __pyx_int_neg_1); if (unlikely(!__pyx_slice__3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 26; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice__3);
+  __Pyx_GIVEREF(__pyx_slice__3);
+
+  /* "nms/gpu_nms.pyx":28
+ *         order = scores.argsort()[::-1]
+ *     cdef np.ndarray[np.float32_t, ndim=2] \
+ *         sorted_dets = dets[order, :]             # <<<<<<<<<<<<<<
+ *     _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
+ *     keep = keep[:num_out]
+ */
+  __pyx_slice__4 = PySlice_New(Py_None, Py_None, Py_None); if (unlikely(!__pyx_slice__4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_slice__4);
+  __Pyx_GIVEREF(__pyx_slice__4);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":215
+ *             if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not C contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ */
+  __pyx_tuple__5 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_C_contiguous); if (unlikely(!__pyx_tuple__5)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 215; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__5);
+  __Pyx_GIVEREF(__pyx_tuple__5);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":219
+ *             if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS)
+ *                 and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)):
+ *                 raise ValueError(u"ndarray is not Fortran contiguous")             # <<<<<<<<<<<<<<
+ * 
+ *             info.buf = PyArray_DATA(self)
+ */
+  __pyx_tuple__6 = PyTuple_Pack(1, __pyx_kp_u_ndarray_is_not_Fortran_contiguou); if (unlikely(!__pyx_tuple__6)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 219; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__6);
+  __Pyx_GIVEREF(__pyx_tuple__6);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":257
+ *                 if ((descr.byteorder == c'>' and little_endian) or
+ *                     (descr.byteorder == c'<' and not little_endian)):
+ *                     raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *                 if   t == NPY_BYTE:        f = "b"
+ *                 elif t == NPY_UBYTE:       f = "B"
+ */
+  __pyx_tuple__7 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__7)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 257; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__7);
+  __Pyx_GIVEREF(__pyx_tuple__7);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":799
+ * 
+ *         if (end - f) - <int>(new_offset - offset[0]) < 15:
+ *             raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd")             # <<<<<<<<<<<<<<
+ * 
+ *         if ((child.byteorder == c'>' and little_endian) or
+ */
+  __pyx_tuple__8 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor); if (unlikely(!__pyx_tuple__8)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 799; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__8);
+  __Pyx_GIVEREF(__pyx_tuple__8);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":803
+ *         if ((child.byteorder == c'>' and little_endian) or
+ *             (child.byteorder == c'<' and not little_endian)):
+ *             raise ValueError(u"Non-native byte order not supported")             # <<<<<<<<<<<<<<
+ *             # One could encode it in the format string and have Cython
+ *             # complain instead, BUT: < and > in format strings also imply
+ */
+  __pyx_tuple__9 = PyTuple_Pack(1, __pyx_kp_u_Non_native_byte_order_not_suppor); if (unlikely(!__pyx_tuple__9)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 803; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__9);
+  __Pyx_GIVEREF(__pyx_tuple__9);
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":823
+ *             t = child.type_num
+ *             if end - f < 5:
+ *                 raise RuntimeError(u"Format string allocated too short.")             # <<<<<<<<<<<<<<
+ * 
+ *             # Until ticket #99 is fixed, use integers to avoid warnings
+ */
+  __pyx_tuple__10 = PyTuple_Pack(1, __pyx_kp_u_Format_string_allocated_too_shor_2); if (unlikely(!__pyx_tuple__10)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 823; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__10);
+  __Pyx_GIVEREF(__pyx_tuple__10);
+
+  /* "nms/gpu_nms.pyx":16
+ *     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
+ * 
+ * def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,             # <<<<<<<<<<<<<<
+ *             np.int32_t device_id=0):
+ *     cdef int boxes_num = dets.shape[0]
+ */
+  __pyx_tuple__11 = PyTuple_Pack(10, __pyx_n_s_dets, __pyx_n_s_thresh, __pyx_n_s_device_id, __pyx_n_s_boxes_num, __pyx_n_s_boxes_dim, __pyx_n_s_num_out, __pyx_n_s_keep, __pyx_n_s_scores, __pyx_n_s_order, __pyx_n_s_sorted_dets); if (unlikely(!__pyx_tuple__11)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_tuple__11);
+  __Pyx_GIVEREF(__pyx_tuple__11);
+  __pyx_codeobj__12 = (PyObject*)__Pyx_PyCode_New(3, 0, 10, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__11, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_nfs_yoda_xinleic_Inf_Code_Faste, __pyx_n_s_gpu_nms, 16, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__12)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_RefNannyFinishContext();
+  return 0;
+  __pyx_L1_error:;
+  __Pyx_RefNannyFinishContext();
+  return -1;
+}
+
+static int __Pyx_InitGlobals(void) { /* Build the module's string table and the cached int objects 4 and -1. Returns 0 on success, -1 on failure. */
+  if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; /* string table could not be initialised */
+  __pyx_int_4 = PyInt_FromLong(4); if (unlikely(!__pyx_int_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* cached int object 4 */
+  __pyx_int_neg_1 = PyInt_FromLong(-1); if (unlikely(!__pyx_int_neg_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* cached int object -1; it is the step of the [::-1] slice constant */
+  return 0;
+  __pyx_L1_error:; /* the failing statement already stored its position in __pyx_filename/__pyx_lineno/__pyx_clineno */
+  return -1;
+}
+
+#if PY_MAJOR_VERSION < 3 /* the entry point is named initgpu_nms on Python 2 and PyInit_gpu_nms on Python 3 */
+PyMODINIT_FUNC initgpu_nms(void); /*proto*/
+PyMODINIT_FUNC initgpu_nms(void)
+#else
+PyMODINIT_FUNC PyInit_gpu_nms(void); /*proto*/
+PyMODINIT_FUNC PyInit_gpu_nms(void)
+#endif
+{ /* Module initialiser for nms.gpu_nms: creates the module, fills its globals, imports the numpy types and runs the module-level code of gpu_nms.pyx. On Python 3 it returns the module object (NULL after a failure); on Python 2 it returns nothing. */
+  PyObject *__pyx_t_1 = NULL; /* single temporary reused by every module-level statement below */
+  int __pyx_lineno = 0; /* position of the failing statement, filled in just before each jump to __pyx_L1_error */
+  const char *__pyx_filename = NULL;
+  int __pyx_clineno = 0;
+  __Pyx_RefNannyDeclarations
+  #if CYTHON_REFNANNY /* debug builds only: locate the refnanny API, trying the bare module name first */
+  __Pyx_RefNanny = __Pyx_RefNannyImportAPI("refnanny");
+  if (!__Pyx_RefNanny) {
+      PyErr_Clear(); /* discard the failure of the first import before trying the fallback */
+      __Pyx_RefNanny = __Pyx_RefNannyImportAPI("Cython.Runtime.refnanny");
+      if (!__Pyx_RefNanny)
+          Py_FatalError("failed to import 'refnanny' module");
+  }
+  #endif
+  __Pyx_RefNannySetupContext("PyMODINIT_FUNC PyInit_gpu_nms(void)", 0);
+  if ( __Pyx_check_binary_version() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_empty_tuple = PyTuple_New(0); if (unlikely(!__pyx_empty_tuple)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* shared empty tuple */
+  __pyx_empty_bytes = PyBytes_FromStringAndSize("", 0); if (unlikely(!__pyx_empty_bytes)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* shared empty bytes object */
+  #ifdef __Pyx_CyFunction_USED /* the three optional helper initialisers below are compiled in only when the corresponding feature macro is defined */
+  if (__Pyx_CyFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  #ifdef __Pyx_FusedFunction_USED
+  if (__pyx_FusedFunction_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  #ifdef __Pyx_Generator_USED
+  if (__pyx_Generator_init() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  /*--- Library function declarations ---*/
+  /*--- Threads initialization code ---*/
+  #if defined(__PYX_FORCE_INIT_THREADS) && __PYX_FORCE_INIT_THREADS
+  #ifdef WITH_THREAD /* Python build with threading support? */
+  PyEval_InitThreads();
+  #endif
+  #endif
+  /*--- Module creation code ---*/
+  #if PY_MAJOR_VERSION < 3
+  __pyx_m = Py_InitModule4(__Pyx_NAMESTR("gpu_nms"), __pyx_methods, 0, 0, PYTHON_API_VERSION); Py_XINCREF(__pyx_m); /* Python 2 path also takes its own reference to the new module */
+  #else
+  __pyx_m = PyModule_Create(&__pyx_moduledef);
+  #endif
+  if (unlikely(!__pyx_m)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_d = PyModule_GetDict(__pyx_m); if (unlikely(!__pyx_d)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* module dict; module-level names are stored here below */
+  Py_INCREF(__pyx_d); /* take a reference for the cached global __pyx_d */
+  __pyx_b = PyImport_AddModule(__Pyx_NAMESTR(__Pyx_BUILTIN_MODULE_NAME)); if (unlikely(!__pyx_b)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* the builtins module */
+  #if CYTHON_COMPILING_IN_PYPY
+  Py_INCREF(__pyx_b);
+  #endif
+  if (__Pyx_SetAttrString(__pyx_m, "__builtins__", __pyx_b) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  /*--- Initialize various global constants etc. ---*/
+  if (unlikely(__Pyx_InitGlobals() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #if PY_MAJOR_VERSION < 3 && (__PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT)
+  if (__Pyx_init_sys_getdefaultencoding_params() < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  #endif
+  if (__pyx_module_is_main_nms__gpu_nms) { /* flag is defined outside this function; when it is set the module's __name__ is overwritten with __pyx_n_s_main */
+    if (__Pyx_SetAttrString(__pyx_m, "__name__", __pyx_n_s_main) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;};
+  }
+  #if PY_MAJOR_VERSION >= 3
+  { /* Python 3 only: register the module in the import system's module dict as "nms.gpu_nms" unless an entry already exists */
+    PyObject *modules = PyImport_GetModuleDict(); if (unlikely(!modules)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    if (!PyDict_GetItemString(modules, "nms.gpu_nms")) {
+      if (unlikely(PyDict_SetItemString(modules, "nms.gpu_nms", __pyx_m) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+  }
+  #endif
+  /*--- Builtin init code ---*/
+  if (unlikely(__Pyx_InitCachedBuiltins() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  /*--- Constants init code ---*/
+  if (unlikely(__Pyx_InitCachedConstants() < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  /*--- Global init code ---*/
+  /*--- Variable export code ---*/
+  /*--- Function export code ---*/
+  /*--- Type init code ---*/
+  /*--- Type import code ---*/
+  __pyx_ptype_7cpython_4type_type = __Pyx_ImportType(__Pyx_BUILTIN_MODULE_NAME, "type", 
+  #if CYTHON_COMPILING_IN_PYPY
+  sizeof(PyTypeObject),
+  #else
+  sizeof(PyHeapTypeObject),
+  #endif
+  0); if (unlikely(!__pyx_ptype_7cpython_4type_type)) {__pyx_filename = __pyx_f[2]; __pyx_lineno = 9; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* NOTE(review): the sizeof argument is presumably used by __Pyx_ImportType to validate the struct layout - confirm in its definition */
+  __pyx_ptype_5numpy_dtype = __Pyx_ImportType("numpy", "dtype", sizeof(PyArray_Descr), 0); if (unlikely(!__pyx_ptype_5numpy_dtype)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_flatiter = __Pyx_ImportType("numpy", "flatiter", sizeof(PyArrayIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_flatiter)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 165; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_broadcast = __Pyx_ImportType("numpy", "broadcast", sizeof(PyArrayMultiIterObject), 0); if (unlikely(!__pyx_ptype_5numpy_broadcast)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 169; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_ndarray = __Pyx_ImportType("numpy", "ndarray", sizeof(PyArrayObject), 0); if (unlikely(!__pyx_ptype_5numpy_ndarray)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 178; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __pyx_ptype_5numpy_ufunc = __Pyx_ImportType("numpy", "ufunc", sizeof(PyUFuncObject), 0); if (unlikely(!__pyx_ptype_5numpy_ufunc)) {__pyx_filename = __pyx_f[1]; __pyx_lineno = 861; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  /*--- Variable import code ---*/
+  /*--- Function import code ---*/
+  /*--- Execution code ---*/
+
+  /* "nms/gpu_nms.pyx":8
+ * # --------------------------------------------------------
+ * 
+ * import numpy as np             # <<<<<<<<<<<<<<
+ * cimport numpy as np
+ * 
+ */
+  __pyx_t_1 = __Pyx_Import(__pyx_n_s_numpy, 0, -1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_np, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 8; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* bind the imported module as `np` in the module dict */
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "nms/gpu_nms.pyx":11
+ * cimport numpy as np
+ * 
+ * assert sizeof(int) == sizeof(np.int32_t)             # <<<<<<<<<<<<<<
+ * 
+ * cdef extern from "gpu_nms.hpp":
+ */
+  #ifndef CYTHON_WITHOUT_ASSERTIONS
+  if (unlikely(!Py_OptimizeFlag)) { /* the assertion is skipped when Py_OptimizeFlag is non-zero */
+    if (unlikely(!(((sizeof(int)) == (sizeof(__pyx_t_5numpy_int32_t))) != 0))) {
+      PyErr_SetNone(PyExc_AssertionError);
+      {__pyx_filename = __pyx_f[0]; __pyx_lineno = 11; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+    }
+  }
+  #endif
+
+  /* "nms/gpu_nms.pyx":16
+ *     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
+ * 
+ * def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,             # <<<<<<<<<<<<<<
+ *             np.int32_t device_id=0):
+ *     cdef int boxes_num = dets.shape[0]
+ */
+  __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_3nms_7gpu_nms_1gpu_nms, NULL, __pyx_n_s_nms_gpu_nms); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* wrap the C implementation of gpu_nms in a callable object */
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_gpu_nms, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* publish it in the module dict under the gpu_nms name */
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "nms/gpu_nms.pyx":1
+ * # --------------------------------------------------------             # <<<<<<<<<<<<<<
+ * # Faster R-CNN
+ * # Copyright (c) 2015 Microsoft
+ */
+  __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}
+  __Pyx_GOTREF(__pyx_t_1);
+  if (PyDict_SetItem(__pyx_d, __pyx_n_s_test, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} /* store a fresh empty dict in the module dict under the key __pyx_n_s_test */
+  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
+
+  /* "/home/xinleic/anaconda/lib/python2.7/site-packages/Cython/Includes/numpy/__init__.pxd":976
+ *      arr.base = baseptr
+ * 
+ * cdef inline object get_array_base(ndarray arr):             # <<<<<<<<<<<<<<
+ *     if arr.base is NULL:
+ *         return None
+ */
+  goto __pyx_L0; /* success: skip the error handler */
+  __pyx_L1_error:; /* shared failure path for every `goto __pyx_L1_error` above */
+  __Pyx_XDECREF(__pyx_t_1); /* release the temporary if a statement failed while holding it */
+  if (__pyx_m) {
+    __Pyx_AddTraceback("init nms.gpu_nms", __pyx_clineno, __pyx_lineno, __pyx_filename);
+    Py_DECREF(__pyx_m); __pyx_m = 0; /* drop the half-initialised module so the Python 3 return value is NULL */
+  } else if (!PyErr_Occurred()) {
+    PyErr_SetString(PyExc_ImportError, "init nms.gpu_nms"); /* module creation failed without an exception being set: raise a generic ImportError */
+  }
+  __pyx_L0:;
+  __Pyx_RefNannyFinishContext();
+  #if PY_MAJOR_VERSION < 3
+  return;
+  #else
+  return __pyx_m;
+  #endif
+}
+
+/* Runtime support code */
+#if CYTHON_REFNANNY
+/* Import `modname` and decode its "RefNannyAPI" attribute into an API struct pointer; yields NULL if either step fails. */
+static __Pyx_RefNannyAPIStruct *__Pyx_RefNannyImportAPI(const char *modname) {
+    void *api = NULL;
+    PyObject *handle = NULL;
+    PyObject *module = PyImport_ImportModule((char *)modname);
+    if (module) {
+        handle = PyObject_GetAttrString(module, (char *)"RefNannyAPI");
+        if (handle) api = PyLong_AsVoidPtr(handle);
+    }
+    Py_XDECREF(handle);
+    Py_XDECREF(module);
+    return (__Pyx_RefNannyAPIStruct *)api;
+}
+#endif /* CYTHON_REFNANNY */
+
+static void __Pyx_RaiseArgtupleInvalid(
+    const char* func_name,
+    int exact,
+    Py_ssize_t num_min,
+    Py_ssize_t num_max,
+    Py_ssize_t num_found)
+{
+    /* Set a TypeError describing a wrong number of positional arguments.
+     *
+     * func_name  name shown in the message
+     * exact      non-zero to word the bound as "exactly"
+     * num_min    smallest accepted count; reported when too few were given
+     * num_max    largest accepted count; reported otherwise
+     * num_found  count actually received
+     */
+    const int too_few = (num_found < num_min);
+    const Py_ssize_t num_expected = too_few ? num_min : num_max;
+    const char *bound_word = exact ? "exactly" : (too_few ? "at least" : "at most");
+    const char *plural_suffix = (num_expected == 1) ? "" : "s";
+    PyErr_Format(PyExc_TypeError,
+                 "%.200s() takes %.8s %" CYTHON_FORMAT_SSIZE_T "d positional argument%.1s (%" CYTHON_FORMAT_SSIZE_T "d given)",
+                 func_name, bound_word, num_expected,
+                 plural_suffix, num_found);
+}
+
+static void __Pyx_RaiseDoubleKeywordsError(
+    const char* func_name,
+    PyObject* kw_name)
+{ /* Set a TypeError reporting that keyword `kw_name` was given a value more than once in a call to `func_name`. */
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%U'", func_name, kw_name);
+#else
+    PyErr_Format(PyExc_TypeError,
+        "%s() got multiple values for keyword argument '%s'", func_name, PyString_AsString(kw_name));
+#endif
+}
+
+static int __Pyx_ParseOptionalKeywords( /* Distribute the entries of the keyword dict `kwds` into `values`. Returns 0 on success, -1 with an exception set on failure. */
+    PyObject *kwds, /* keyword dict to consume */
+    PyObject **argnames[], /* NULL-terminated array of pointers to the argument-name objects */
+    PyObject *kwds2, /* optional dict that receives keywords matching no name; when NULL such keywords are an error */
+    PyObject *values[], /* output, indexed like argnames */
+    Py_ssize_t num_pos_args, /* the first num_pos_args names are treated as already supplied positionally */
+    const char* function_name) /* used only in error messages */
+{
+    PyObject *key = 0, *value = 0;
+    Py_ssize_t pos = 0;
+    PyObject*** name;
+    PyObject*** first_kw_arg = argnames + num_pos_args; /* first name that may still be filled by keyword */
+    while (PyDict_Next(kwds, &pos, &key, &value)) {
+        name = first_kw_arg;
+        while (*name && (**name != key)) name++; /* fast path: compare by object identity only */
+        if (*name) {
+            values[name-argnames] = value;
+            continue;
+        }
+        name = first_kw_arg; /* no identical object found: retry with a comparison by value */
+        #if PY_MAJOR_VERSION < 3
+        if (likely(PyString_CheckExact(key)) || likely(PyString_Check(key))) {
+            while (*name) {
+                if ((CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**name) == PyString_GET_SIZE(key)) /* outside PyPy, compare lengths before contents */
+                        && _PyString_Eq(**name, key)) {
+                    values[name-argnames] = value;
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                PyObject*** argname = argnames; /* not a keyword-fillable name: check whether it repeats one of the positional names */
+                while (argname != first_kw_arg) {
+                    if ((**argname == key) || (
+                            (CYTHON_COMPILING_IN_PYPY || PyString_GET_SIZE(**argname) == PyString_GET_SIZE(key))
+                             && _PyString_Eq(**argname, key))) {
+                        goto arg_passed_twice;
+                    }
+                    argname++;
+                }
+            }
+        } else
+        #endif
+        if (likely(PyUnicode_Check(key))) {
+            while (*name) {
+                int cmp = (**name == key) ? 0 :
+                #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                    (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :
+                #endif
+                    PyUnicode_Compare(**name, key);
+                if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad; /* a negative result with an exception set means the comparison itself failed */
+                if (cmp == 0) {
+                    values[name-argnames] = value;
+                    break;
+                }
+                name++;
+            }
+            if (*name) continue;
+            else {
+                PyObject*** argname = argnames; /* same duplicate check as the Python 2 string branch, for unicode keys */
+                while (argname != first_kw_arg) {
+                    int cmp = (**argname == key) ? 0 :
+                    #if !CYTHON_COMPILING_IN_PYPY && PY_MAJOR_VERSION >= 3
+                        (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :
+                    #endif
+                        PyUnicode_Compare(**argname, key);
+                    if (cmp < 0 && unlikely(PyErr_Occurred())) goto bad;
+                    if (cmp == 0) goto arg_passed_twice;
+                    argname++;
+                }
+            }
+        } else
+            goto invalid_keyword_type; /* key is neither a unicode object nor (on Python 2) a str */
+        if (kwds2) { /* reached only when the key matched no name at all */
+            if (unlikely(PyDict_SetItem(kwds2, key, value))) goto bad;
+        } else {
+            goto invalid_keyword;
+        }
+    }
+    return 0;
+arg_passed_twice:
+    __Pyx_RaiseDoubleKeywordsError(function_name, key);
+    goto bad;
+invalid_keyword_type:
+    PyErr_Format(PyExc_TypeError,
+        "%.200s() keywords must be strings", function_name);
+    goto bad;
+invalid_keyword:
+    PyErr_Format(PyExc_TypeError,
+    #if PY_MAJOR_VERSION < 3
+        "%.200s() got an unexpected keyword argument '%.200s'",
+        function_name, PyString_AsString(key));
+    #else
+        "%s() got an unexpected keyword argument '%U'",
+        function_name, key);
+    #endif
+bad: /* invalid_keyword falls through to here */
+    return -1;
+}
+
+static void __Pyx_RaiseArgumentTypeInvalid(const char* name, PyObject *obj, PyTypeObject *type) {
+    const char *wanted = type->tp_name, *actual = Py_TYPE(obj)->tp_name;  /* type names of the expected type and of the object actually passed */
+    PyErr_Format(PyExc_TypeError,
+        "Argument '%.200s' has incorrect type (expected %.200s, got %.200s)", name, wanted, actual);
+}
+static CYTHON_INLINE int __Pyx_ArgTypeTest(PyObject *obj, PyTypeObject *type, int none_allowed,
+    const char *name, int exact)
+{
+    /* Check argument `obj` (reported as `name`) against `type`.
+     * Returns 1 when accepted. Returns 0 with SystemError set if `type` is
+     * NULL, or with TypeError set if `obj` does not match. */
+    int accepted;
+    if (unlikely(!type)) {
+        PyErr_SetString(PyExc_SystemError, "Missing type object");
+        return 0;
+    }
+    if (none_allowed && obj == Py_None) return 1;
+    accepted = exact ? (Py_TYPE(obj) == type) : PyObject_TypeCheck(obj, type);
+    #if PY_MAJOR_VERSION == 2
+    if (exact && !accepted) accepted = (type == &PyBaseString_Type) && __Pyx_PyBaseString_CheckExact(obj);
+    #endif
+    if (likely(accepted)) return 1;
+    __Pyx_RaiseArgumentTypeInvalid(name, obj, type);
+    return 0;
+}
+
+static CYTHON_INLINE int __Pyx_IsLittleEndian(void) {
+  const unsigned int probe = 1;  /* non-zero only in the least significant byte */
+  return ((const unsigned char *)&probe)[0] != 0;  /* 1 when that byte is stored at the lowest address */
+}
+static void __Pyx_BufFmt_Init(__Pyx_BufFmt_Context* ctx,
+                              __Pyx_BufFmt_StackElem* stack,
+                              __Pyx_TypeInfo* type) {
+  /* Reset the format-parser state in `ctx` so that `type` is the expected
+     dtype, using `stack` as storage for the struct-nesting stack. */
+  ctx->root.type = type;
+  ctx->root.name = "buffer dtype";
+  ctx->root.offset = 0;
+  stack[0].field = &ctx->root;
+  stack[0].parent_offset = 0;
+  ctx->head = stack;
+  ctx->fmt_offset = 0;
+  ctx->new_packmode = '@';
+  ctx->enc_packmode = '@';
+  ctx->new_count = 1;
+  ctx->enc_count = 0;
+  ctx->enc_type = 0;
+  ctx->is_complex = 0;
+  ctx->is_valid_array = 0;
+  ctx->struct_alignment = 0;
+  /* Descend through leading struct levels, pushing the first field of each. */
+  for (; type->typegroup == 'S'; type = type->fields->type) {
+    __Pyx_BufFmt_StackElem* frame = ++ctx->head;
+    frame->field = type->fields;
+    frame->parent_offset = 0;
+  }
+}
+/* Parse an unsigned decimal integer starting at *ts.
+ * Returns the value and advances *ts past the digits; returns -1 and leaves
+ * *ts untouched when the first character is not a digit. No overflow check. */
+static int __Pyx_BufFmt_ParseNumber(const char** ts) {
+    int count;
+    const char* t = *ts;
+    if (*t < '0' || *t > '9') return -1;
+    count = *t++ - '0';
+    /* The upper bound must include '9': with `< '9'` the text "19" parsed as 1 and left "9" unread. */
+    while (*t >= '0' && *t <= '9') {
+        count = count * 10 + (*t++ - '0');
+    }
+    *ts = t;
+    return count;
+}
+static int __Pyx_BufFmt_ExpectNumber(const char **ts) { /* Like __Pyx_BufFmt_ParseNumber, but sets a ValueError naming the offending character when no digit is found; the -1 is passed through. */
+    const int parsed = __Pyx_BufFmt_ParseNumber(ts);
+    if (parsed != -1) return parsed;
+    PyErr_Format(PyExc_ValueError,
+                 "Does not understand character buffer dtype format string ('%c')", **ts);
+    return -1;
+}
+static void __Pyx_BufFmt_RaiseUnexpectedChar(char ch) { /* Set a ValueError that quotes the unrecognised format character `ch`. */
+  static const char message[] = "Unexpected format string character: '%c'";
+  PyErr_Format(PyExc_ValueError, message, ch);
+}
+static const char* __Pyx_BufFmt_DescribeTypeChar(char ch, int is_complex) {
+  switch (ch) {
+    case 'c': return "'char'";
+    case 'b': return "'signed char'";
+    case 'B': return "'unsigned char'";
+    case 'h': return "'short'";
+    case 'H': return "'unsigned short'";
+    case 'i': return "'int'";
+    case 'I': return "'unsigned int'";
+    case 'l': return "'long'";
+    case 'L': return "'unsigned long'";
+    case 'q': return "'long long'";
+    case 'Q': return "'unsigned long long'";
+    case 'f': return (is_complex ? "'complex float'" : "'float'");
+    case 'd': return (is_complex ? "'complex double'" : "'double'");
+    case 'g': return (is_complex ? "'complex long double'" : "'long double'");
+    case 'T': return "a struct";
+    case 'O': return "Python object";
+    case 'P': return "a pointer";
+    case 's': case 'p': return "a string";
+    case 0: return "end";
+    default: return "unparseable format string";
+  }
+}
+static size_t __Pyx_BufFmt_TypeCharToStandardSize(char ch, int is_complex) { /* Item size in bytes of format character `ch` when a non-native pack mode is in effect; returns 0 with ValueError set for 'g' and for unknown characters. */
+  const size_t parts = is_complex ? 2 : 1;  /* a complex value occupies two floating-point parts */
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1;
+    case 'h': case 'H': return 2;
+    case 'i': case 'I': case 'l': case 'L': return 4;
+    case 'q': case 'Q': return 8;
+    case 'f': return 4 * parts;
+    case 'd': return 8 * parts;
+    case 'O': case 'P': return sizeof(void*);
+    case 'g':
+      PyErr_SetString(PyExc_ValueError, "Python does not define a standard format string size for long double ('g')..");
+      return 0;
+    default:
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+  }
+}
+static size_t __Pyx_BufFmt_TypeCharToNativeSize(char ch, int is_complex) { /* Item size in bytes of format character `ch` using this platform's C types; `is_complex` doubles the floating-point sizes. Returns 0 with ValueError set for unknown characters. */
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': return 1;  /* '?' added so bool agrees with the standard-size, alignment and padding helpers */
+    case 'h': case 'H': return sizeof(short);
+    case 'i': case 'I': return sizeof(int);
+    case 'l': case 'L': return sizeof(long);
+    #ifdef HAVE_LONG_LONG
+    case 'q': case 'Q': return sizeof(PY_LONG_LONG);  /* without HAVE_LONG_LONG these characters are reported as unexpected */
+    #endif
+    case 'f': return sizeof(float) * (is_complex ? 2 : 1);
+    case 'd': return sizeof(double) * (is_complex ? 2 : 1);
+    case 'g': return sizeof(long double) * (is_complex ? 2 : 1);
+    case 'O': case 'P': return sizeof(void*);
+    default: {
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+    }
+  }
+}
+typedef struct { char c; short x; } __Pyx_st_short;  /* probe structs: sizeof(probe) - sizeof(member type) is used below as that type's alignment */
+typedef struct { char c; int x; } __Pyx_st_int;
+typedef struct { char c; long x; } __Pyx_st_long;
+typedef struct { char c; float x; } __Pyx_st_float;
+typedef struct { char c; double x; } __Pyx_st_double;
+typedef struct { char c; long double x; } __Pyx_st_longdouble;
+typedef struct { char c; void *x; } __Pyx_st_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { char c; PY_LONG_LONG x; } __Pyx_st_longlong;
+#endif
+static size_t __Pyx_BufFmt_TypeCharToAlignment(char ch, CYTHON_UNUSED int is_complex) { /* Alignment in bytes for format character `ch`; returns 0 with ValueError set for unknown characters. */
+  size_t alignment = 0;
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': alignment = 1; break;
+    case 'h': case 'H': alignment = sizeof(__Pyx_st_short) - sizeof(short); break;
+    case 'i': case 'I': alignment = sizeof(__Pyx_st_int) - sizeof(int); break;
+    case 'l': case 'L': alignment = sizeof(__Pyx_st_long) - sizeof(long); break;
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q': alignment = sizeof(__Pyx_st_longlong) - sizeof(PY_LONG_LONG); break;
+#endif
+    case 'f': alignment = sizeof(__Pyx_st_float) - sizeof(float); break;
+    case 'd': alignment = sizeof(__Pyx_st_double) - sizeof(double); break;
+    case 'g': alignment = sizeof(__Pyx_st_longdouble) - sizeof(long double); break;
+    case 'P': case 'O': alignment = sizeof(__Pyx_st_void_p) - sizeof(void*); break;
+    default: __Pyx_BufFmt_RaiseUnexpectedChar(ch); break;  /* alignment stays 0 to signal the error */
+  }
+  return alignment;
+}
+/* These are for computing the padding at the end of the struct to align
+   on the first member of the struct. This will probably be the same as above,
+   but we don't have any guarantees.
+ */
+typedef struct { short x; char c; } __Pyx_pad_short;
+typedef struct { int x; char c; } __Pyx_pad_int;
+typedef struct { long x; char c; } __Pyx_pad_long;
+typedef struct { float x; char c; } __Pyx_pad_float;
+typedef struct { double x; char c; } __Pyx_pad_double;
+typedef struct { long double x; char c; } __Pyx_pad_longdouble;
+typedef struct { void *x; char c; } __Pyx_pad_void_p;
+#ifdef HAVE_LONG_LONG
+typedef struct { PY_LONG_LONG x; char c; } __Pyx_pad_longlong;
+#endif
+static size_t __Pyx_BufFmt_TypeCharToPadding(char ch, CYTHON_UNUSED int is_complex) { /* Size of a struct holding a `ch` member followed by one char, minus the member's size; returns 0 with ValueError set for unknown characters. */
+  size_t padding = 0;
+  switch (ch) {
+    case '?': case 'c': case 'b': case 'B': case 's': case 'p': padding = 1; break;
+    case 'h': case 'H': padding = sizeof(__Pyx_pad_short) - sizeof(short); break;
+    case 'i': case 'I': padding = sizeof(__Pyx_pad_int) - sizeof(int); break;
+    case 'l': case 'L': padding = sizeof(__Pyx_pad_long) - sizeof(long); break;
+#ifdef HAVE_LONG_LONG
+    case 'q': case 'Q': padding = sizeof(__Pyx_pad_longlong) - sizeof(PY_LONG_LONG); break;
+#endif
+    case 'f': padding = sizeof(__Pyx_pad_float) - sizeof(float); break;
+    case 'd': padding = sizeof(__Pyx_pad_double) - sizeof(double); break;
+    case 'g': padding = sizeof(__Pyx_pad_longdouble) - sizeof(long double); break;
+    case 'P': case 'O': padding = sizeof(__Pyx_pad_void_p) - sizeof(void*); break;
+    default: __Pyx_BufFmt_RaiseUnexpectedChar(ch); break;  /* padding stays 0 to signal the error */
+  }
+  return padding;
+}
+static char __Pyx_BufFmt_TypeCharToGroup(char ch, int is_complex) { /* Map format character `ch` to the type-group letter compared against __Pyx_TypeInfo.typegroup. Returns 0 with ValueError set for unknown characters. */
+  switch (ch) {
+    case 'c':
+        return 'H';  /* plain char gets its own group; the type-chunk check accepts it against any same-sized type */
+    case 'b': case 'h': case 'i':
+    case 'l': case 'q': case 's': case 'p':
+        return 'I';  /* signed integers, plus the string codes */
+    case '?': case 'B': case 'H': case 'I': case 'L': case 'Q':
+        return 'U';  /* unsigned integers; '?' (bool) added so it is no longer rejected as an unexpected character */
+    case 'f': case 'd': case 'g':
+        return (is_complex ? 'C' : 'R');  /* complex versus real floating point */
+    case 'O':
+        return 'O';
+    case 'P':
+        return 'P';
+    default: {
+      __Pyx_BufFmt_RaiseUnexpectedChar(ch);
+      return 0;
+    }
+  }
+}
+static void __Pyx_BufFmt_RaiseExpected(__Pyx_BufFmt_Context* ctx) {
+  /* Set a ValueError saying which dtype was expected at the current stack
+     position versus the type char (ctx->enc_type) actually seen. */
+  const char* got = __Pyx_BufFmt_DescribeTypeChar(ctx->enc_type, ctx->is_complex);
+  if (ctx->head != NULL && ctx->head->field != &ctx->root) {
+    /* Inside a struct: name the member and the struct that contains it. */
+    __Pyx_StructField* member = ctx->head->field;
+    __Pyx_StructField* owner = (ctx->head - 1)->field;
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected '%s' but got %s in '%s.%s'",
+                 member->type->name, got,
+                 owner->type->name, member->name);
+    return;
+  }
+  if (ctx->head == NULL) {
+    /* The expected type was already fully consumed. */
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected %s%s%s but got %s", "", "end", "", got);
+  } else {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer dtype mismatch, expected %s%s%s but got %s",
+                 "'", ctx->head->field->type->name, "'", got);
+  }
+}
+static int __Pyx_BufFmt_ProcessTypeChunk(__Pyx_BufFmt_Context* ctx) { /* Match the pending run of ctx->enc_count items of type ctx->enc_type against the expected fields on the context stack. Returns 0 on success, -1 with ValueError set on a mismatch. */
+  char group;
+  size_t size, offset, arraysize = 1;
+  if (ctx->enc_type == 0) return 0; /* no pending type: nothing to check */
+  if (ctx->head->field->type->arraysize[0]) { /* the expected field is declared as an array */
+    int i, ndim = 0;
+    if (ctx->enc_type == 's' || ctx->enc_type == 'p') { /* string codes count as a 1-D array whose length is the repeat count */
+        ctx->is_valid_array = ctx->head->field->type->ndim == 1;
+        ndim = 1;
+        if (ctx->enc_count != ctx->head->field->type->arraysize[0]) {
+            PyErr_Format(PyExc_ValueError,
+                         "Expected a dimension of size %zu, got %zu",
+                         ctx->head->field->type->arraysize[0], ctx->enc_count);
+            return -1;
+        }
+    }
+    if (!ctx->is_valid_array) { /* NOTE(review): for non-string types this flag is presumably set by the array-spec parser - confirm */
+      PyErr_Format(PyExc_ValueError, "Expected %d dimensions, got %d",
+                   ctx->head->field->type->ndim, ndim);
+      return -1;
+    }
+    for (i = 0; i < ctx->head->field->type->ndim; i++) {
+      arraysize *= ctx->head->field->type->arraysize[i]; /* total element count = product of all dimensions */
+    }
+    ctx->is_valid_array = 0; /* flag is consumed; the whole array is then handled as a single item */
+    ctx->enc_count = 1;
+  }
+  group = __Pyx_BufFmt_TypeCharToGroup(ctx->enc_type, ctx->is_complex);
+  do { /* one iteration per expected field (or per descent into a complex struct) */
+    __Pyx_StructField* field = ctx->head->field;
+    __Pyx_TypeInfo* type = field->type;
+    if (ctx->enc_packmode == '@' || ctx->enc_packmode == '^') { /* these two pack modes use native sizes, every other mode the standard sizes */
+      size = __Pyx_BufFmt_TypeCharToNativeSize(ctx->enc_type, ctx->is_complex);
+    } else {
+      size = __Pyx_BufFmt_TypeCharToStandardSize(ctx->enc_type, ctx->is_complex);
+    }
+    if (ctx->enc_packmode == '@') { /* only '@' applies alignment */
+      size_t align_at = __Pyx_BufFmt_TypeCharToAlignment(ctx->enc_type, ctx->is_complex);
+      size_t align_mod_offset;
+      if (align_at == 0) return -1; /* 0 means the helper already raised for an unknown type char */
+      align_mod_offset = ctx->fmt_offset % align_at;
+      if (align_mod_offset > 0) ctx->fmt_offset += align_at - align_mod_offset; /* round the running offset up to the next aligned position */
+      if (ctx->struct_alignment == 0) /* record the padding value of the first item seen */
+          ctx->struct_alignment = __Pyx_BufFmt_TypeCharToPadding(ctx->enc_type,
+                                                                 ctx->is_complex);
+    }
+    if (type->size != size || type->typegroup != group) {
+      if (type->typegroup == 'C' && type->fields != NULL) { /* expected a complex type that has fields: descend and match its first field instead */
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        ++ctx->head;
+        ctx->head->field = type->fields;
+        ctx->head->parent_offset = parent_offset;
+        continue;
+      }
+      if ((type->typegroup == 'H' || group == 'H') && type->size == size) { /* group 'H' on either side is accepted when the sizes agree */
+      } else {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+      }
+    }
+    offset = ctx->head->parent_offset + field->offset; /* offset at which the expected field must sit */
+    if (ctx->fmt_offset != offset) {
+      PyErr_Format(PyExc_ValueError,
+                   "Buffer dtype mismatch; next field is at offset %" CYTHON_FORMAT_SSIZE_T "d but %" CYTHON_FORMAT_SSIZE_T "d expected",
+                   (Py_ssize_t)ctx->fmt_offset, (Py_ssize_t)offset);
+      return -1;
+    }
+    ctx->fmt_offset += size;
+    if (arraysize)
+      ctx->fmt_offset += (arraysize - 1) * size; /* skip the remaining elements of an array field */
+    --ctx->enc_count; /* Consume from buffer string */
+    while (1) { /* advance the stack to the next expected field */
+      if (field == &ctx->root) { /* the root was just matched: the expected type is complete */
+        ctx->head = NULL;
+        if (ctx->enc_count != 0) { /* the format still has items left over */
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return -1;
+        }
+        break; /* breaks both loops as ctx->enc_count == 0 */
+      }
+      ctx->head->field = ++field;
+      if (field->type == NULL) { /* a NULL type ends the field list: pop back to the parent and advance it */
+        --ctx->head;
+        field = ctx->head->field;
+        continue;
+      } else if (field->type->typegroup == 'S') { /* next field is a struct: push its first field */
+        size_t parent_offset = ctx->head->parent_offset + field->offset;
+        if (field->type->fields->type == NULL) continue; /* empty struct */
+        field = field->type->fields;
+        ++ctx->head;
+        ctx->head->field = field;
+        ctx->head->parent_offset = parent_offset;
+        break;
+      } else {
+        break;
+      }
+    }
+  } while (ctx->enc_count);
+  ctx->enc_type = 0; /* mark the chunk as fully consumed */
+  ctx->is_complex = 0;
+  return 0;
+}
+/* Parse an array shape specifier "(d0,d1,...)" inside a buffer format string.
+ *
+ * On entry *tsp points at the opening '('.  Every dimension that is read is
+ * compared against arraysize[] of the field type at ctx->head, and the number
+ * of dimensions must equal the ndim of that type.  On success *tsp is moved
+ * past the closing ')' and ctx->is_valid_array is set to 1.
+ *
+ * Returns Py_None on success (no reference is taken; it only serves as a
+ * non-NULL marker) or NULL with a ValueError set.
+ */
+static CYTHON_INLINE PyObject *
+__pyx_buffmt_parse_array(__Pyx_BufFmt_Context* ctx, const char** tsp)
+{
+    const char *ts = *tsp;
+    int i = 0, number;
+    int ndim = ctx->head->field->type->ndim;
+    ++ts;  /* step over '(' */
+    if (ctx->new_count != 1) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Cannot handle repeated arrays in format string");
+        return NULL;
+    }
+    if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+    while (*ts && *ts != ')') {
+        switch (*ts) {
+            case ' ': case '\f': case '\r': case '\n': case '\t': case '\v':
+                /* Consume the whitespace character before looping again; a
+                 * bare 'continue' would look at the same character forever. */
+                ++ts;
+                continue;
+            default:  break;  /* not a 'break' in the loop */
+        }
+        number = __Pyx_BufFmt_ExpectNumber(&ts);
+        if (number == -1) return NULL;
+        if (i < ndim && (size_t) number != ctx->head->field->type->arraysize[i])
+            return PyErr_Format(PyExc_ValueError,
+                        "Expected a dimension of size %zu, got %d",
+                        ctx->head->field->type->arraysize[i], number);
+        if (*ts != ',' && *ts != ')')
+            return PyErr_Format(PyExc_ValueError,
+                                "Expected a comma in format string, got '%c'", *ts);
+        if (*ts == ',') ts++;
+        i++;
+    }
+    if (i != ndim)
+        return PyErr_Format(PyExc_ValueError, "Expected %d dimension(s), got %d",
+                            ctx->head->field->type->ndim, i);
+    if (!*ts) {
+        PyErr_SetString(PyExc_ValueError,
+                        "Unexpected end of format string, expected ')'");
+        return NULL;
+    }
+    ctx->is_valid_array = 1;
+    ctx->new_count = 1;
+    *tsp = ++ts;  /* resume parsing after ')' */
+    return Py_None;
+}
+/* Walk the buffer format string `ts` and check it against the type described
+ * by `ctx`.
+ *
+ * Type characters are accumulated in ctx (enc_type / enc_count / enc_packmode /
+ * is_complex) and flushed through __Pyx_BufFmt_ProcessTypeChunk() whenever the
+ * pending type changes.  "T{" recurses into this function once per repeat
+ * count; the matching '}' ends that recursive call.
+ *
+ * Returns the position where parsing stopped (the terminating NUL, or the
+ * character after a '}'), or NULL with a Python exception set.
+ */
+static const char* __Pyx_BufFmt_CheckString(__Pyx_BufFmt_Context* ctx, const char* ts) {
+  int got_Z = 0;
+  while (1) {
+    switch(*ts) {
+      case 0:
+        if (ctx->enc_type != 0 && ctx->head == NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        if (ctx->head != NULL) {
+          __Pyx_BufFmt_RaiseExpected(ctx);
+          return NULL;
+        }
+        return ts;
+      case ' ':
+      case 10:
+      case 13:
+        ++ts;
+        break;
+      case '<':
+        if (!__Pyx_IsLittleEndian()) {
+          PyErr_SetString(PyExc_ValueError, "Little-endian buffer not supported on big-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '>':
+      case '!':
+        if (__Pyx_IsLittleEndian()) {
+          PyErr_SetString(PyExc_ValueError, "Big-endian buffer not supported on little-endian compiler");
+          return NULL;
+        }
+        ctx->new_packmode = '=';
+        ++ts;
+        break;
+      case '=':
+      case '@':
+      case '^':
+        ctx->new_packmode = *ts++;
+        break;
+      case 'T': /* substruct */
+        {
+          const char* ts_after_sub;
+          size_t i, struct_count = ctx->new_count;
+          size_t struct_alignment = ctx->struct_alignment;
+          ctx->new_count = 1;
+          ++ts;
+          if (*ts != '{') {
+            PyErr_SetString(PyExc_ValueError, "Buffer acquisition: Expected '{' after 'T'");
+            return NULL;
+          }
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0; /* Erase processed last struct element */
+          ctx->enc_count = 0;
+          ctx->struct_alignment = 0;
+          ++ts;
+          ts_after_sub = ts;
+          /* Re-parse the same substruct body once per repeat count. */
+          for (i = 0; i != struct_count; ++i) {
+            ts_after_sub = __Pyx_BufFmt_CheckString(ctx, ts);
+            if (!ts_after_sub) return NULL;
+          }
+          ts = ts_after_sub;
+          if (struct_alignment) ctx->struct_alignment = struct_alignment;
+        }
+        break;
+      case '}': /* end of substruct; either repeat or move on */
+        {
+          size_t alignment = ctx->struct_alignment;
+          ++ts;
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_type = 0; /* Erase processed last struct element */
+          if (alignment && ctx->fmt_offset % alignment) {
+            /* Pad the struct size up to a multiple of its alignment. */
+            ctx->fmt_offset += alignment - (ctx->fmt_offset % alignment);
+          }
+        }
+        return ts;
+      case 'x':
+        if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+        ctx->fmt_offset += ctx->new_count;
+        ctx->new_count = 1;
+        ctx->enc_count = 0;
+        ctx->enc_type = 0;
+        ctx->enc_packmode = ctx->new_packmode;
+        ++ts;
+        break;
+      case 'Z':
+        got_Z = 1;
+        ++ts;
+        if (*ts != 'f' && *ts != 'd' && *ts != 'g') {
+          __Pyx_BufFmt_RaiseUnexpectedChar('Z');
+          return NULL;
+        }        /* fall through */
+      case 'c': case 'b': case 'B': case 'h': case 'H': case 'i': case 'I':
+      case 'l': case 'L': case 'q': case 'Q':
+      case 'f': case 'd': case 'g':
+      case 'O': case 's': case 'p':
+        if (ctx->enc_type == *ts && got_Z == ctx->is_complex &&
+            ctx->enc_packmode == ctx->new_packmode) {
+          /* Same type as the pending chunk: just extend its count. */
+          ctx->enc_count += ctx->new_count;
+        } else {
+          if (__Pyx_BufFmt_ProcessTypeChunk(ctx) == -1) return NULL;
+          ctx->enc_count = ctx->new_count;
+          ctx->enc_packmode = ctx->new_packmode;
+          ctx->enc_type = *ts;
+          ctx->is_complex = got_Z;
+        }
+        ++ts;
+        ctx->new_count = 1;
+        got_Z = 0;
+        break;
+      case ':':
+        /* Skip a ":name:" field label.  Stop at the terminating NUL as well,
+         * so that a label without its closing ':' cannot run the scan past
+         * the end of the format string. */
+        ++ts;
+        while (*ts && *ts != ':') ++ts;
+        if (!*ts) {
+          PyErr_SetString(PyExc_ValueError,
+                          "Unexpected end of format string, expected ':'");
+          return NULL;
+        }
+        ++ts;
+        break;
+      case '(':
+        if (!__pyx_buffmt_parse_array(ctx, &ts)) return NULL;
+        break;
+      default:
+        {
+          /* Anything else must be a repeat count for the next item. */
+          int number = __Pyx_BufFmt_ExpectNumber(&ts);
+          if (number == -1) return NULL;
+          ctx->new_count = (size_t)number;
+        }
+    }
+  }
+}
+/* Put `buf` into the "nothing held" state: NULL data and owner pointers, with
+ * shape/strides pointing at __Pyx_zeros and suboffsets at __Pyx_minusones. */
+static CYTHON_INLINE void __Pyx_ZeroBuffer(Py_buffer* buf) {
+  buf->obj = NULL;
+  buf->buf = NULL;
+  buf->shape = __Pyx_zeros;
+  buf->strides = __Pyx_zeros;
+  buf->suboffsets = __Pyx_minusones;
+}
+/* Acquire a buffer view of `obj` into `buf` and check it against `dtype`.
+ *
+ * A NULL or None `obj` yields a zeroed view and success.  Otherwise the view
+ * must have exactly `nd` dimensions, its format string must match `dtype`
+ * (skipped when `cast` is non-zero) and its itemsize must equal dtype->size.
+ * `stack` is scratch space handed to __Pyx_BufFmt_Init().
+ *
+ * Returns 0 on success.  Returns -1 with an exception set on failure; in that
+ * case `buf` is left zeroed and no buffer export is kept alive.
+ */
+static CYTHON_INLINE int __Pyx_GetBufferAndValidate(
+        Py_buffer* buf, PyObject* obj,  __Pyx_TypeInfo* dtype, int flags,
+        int nd, int cast, __Pyx_BufFmt_StackElem* stack)
+{
+  if (obj == Py_None || obj == NULL) {
+    __Pyx_ZeroBuffer(buf);
+    return 0;
+  }
+  buf->buf = NULL;
+  if (__Pyx_GetBuffer(obj, buf, flags) == -1) {
+    /* Nothing was acquired, so there is nothing to release. */
+    __Pyx_ZeroBuffer(buf);
+    return -1;
+  }
+  if (buf->ndim != nd) {
+    PyErr_Format(PyExc_ValueError,
+                 "Buffer has wrong number of dimensions (expected %d, got %d)",
+                 nd, buf->ndim);
+    goto fail;
+  }
+  if (!cast) {
+    __Pyx_BufFmt_Context ctx;
+    __Pyx_BufFmt_Init(&ctx, stack, dtype);
+    if (!__Pyx_BufFmt_CheckString(&ctx, buf->format)) goto fail;
+  }
+  if ((unsigned)buf->itemsize != dtype->size) {
+    PyErr_Format(PyExc_ValueError,
+      "Item size of buffer (%" CYTHON_FORMAT_SSIZE_T "d byte%s) does not match size of '%s' (%" CYTHON_FORMAT_SSIZE_T "d byte%s)",
+      buf->itemsize, (buf->itemsize > 1) ? "s" : "",
+      dtype->name, (Py_ssize_t)dtype->size, (dtype->size > 1) ? "s" : "");
+    goto fail;
+  }
+  if (buf->suboffsets == NULL) buf->suboffsets = __Pyx_minusones;
+  return 0;
+fail:;
+  /* The view was acquired but rejected.  Give it back to the exporter before
+   * wiping the struct; zeroing alone would drop the only handle on the export
+   * and leak it together with the reference to its owner.  suboffsets has not
+   * been redirected to __Pyx_minusones yet at this point, so the view can be
+   * released exactly as it was received. */
+  if (buf->buf != NULL) __Pyx_ReleaseBuffer(buf);
+  __Pyx_ZeroBuffer(buf);
+  return -1;
+}
+/* Release `info` unless it holds no data pointer.  A suboffsets pointer equal
+ * to the __Pyx_minusones placeholder is reset to NULL before the release. */
+static CYTHON_INLINE void __Pyx_SafeReleaseBuffer(Py_buffer* info) {
+  if (info->buf != NULL) {
+    if (info->suboffsets == __Pyx_minusones) {
+      info->suboffsets = NULL;
+    }
+    __Pyx_ReleaseBuffer(info);
+  }
+}
+
+/* Fetch attribute `name` from __pyx_b.  Returns the attribute, or NULL with a
+ * NameError set when the lookup fails. */
+static PyObject *__Pyx_GetBuiltinName(PyObject *name) {
+    PyObject* found = __Pyx_PyObject_GetAttrStr(__pyx_b, name);
+    if (likely(found)) {
+        return found;
+    }
+    /* Report the failed lookup as a NameError. */
+#if PY_MAJOR_VERSION >= 3
+    PyErr_Format(PyExc_NameError, "name '%U' is not defined", name);
+#else
+    PyErr_Format(PyExc_NameError, "name '%.200s' is not defined", PyString_AS_STRING(name));
+#endif
+    return NULL;
+}
+
+static CYTHON_INLINE PyObject *__Pyx_GetModuleGlobalName(PyObject *name) { /* Look `name` up in __pyx_d; on a miss, return whatever __Pyx_GetBuiltinName(name) returns.
+     * A hit is returned as an owned reference on both preprocessor paths. */
+    PyObject *result;
+#if CYTHON_COMPILING_IN_CPYTHON
+    result = PyDict_GetItem(__pyx_d, name);
+    if (result) {
+        Py_INCREF(result); /* PyDict_GetItem() returns a borrowed reference */
+    } else {
+#else
+    result = PyObject_GetItem(__pyx_d, name);
+    if (!result) {
+        PyErr_Clear(); /* drop the lookup error before trying __pyx_b */
+#endif
+        result = __Pyx_GetBuiltinName(name); /* shared miss path of both branches */
+    }
+    return result;
+}
+
+#if CYTHON_COMPILING_IN_CPYTHON
+/* Call `func` with positional tuple `arg` and keyword dict `kw` through its
+ * tp_call slot.  Objects without a tp_call slot are handed to PyObject_Call().
+ * A NULL result without a pending exception is turned into a SystemError. */
+static CYTHON_INLINE PyObject* __Pyx_PyObject_Call(PyObject *func, PyObject *arg, PyObject *kw) {
+    ternaryfunc slot = func->ob_type->tp_call;
+    PyObject *retval;
+    if (unlikely(slot == NULL)) {
+        return PyObject_Call(func, arg, kw);
+    }
+#if PY_VERSION_HEX >= 0x02060000
+    /* Bracket the call with the interpreter's recursion guard. */
+    if (unlikely(Py_EnterRecursiveCall((char*)" while calling a Python object"))) {
+        return NULL;
+    }
+    retval = slot(func, arg, kw);
+    Py_LeaveRecursiveCall();
+#else
+    retval = slot(func, arg, kw);
+#endif
+    if (unlikely(retval == NULL) && unlikely(!PyErr_Occurred())) {
+        PyErr_SetString(
+            PyExc_SystemError,
+            "NULL result without error in PyObject_Call");
+    }
+    return retval;
+}
+#endif
+
+/* Return 1 when `obj` passes PyObject_TypeCheck() against `type`.  Otherwise
+ * return 0 with an exception set: SystemError for a NULL `type`, TypeError for
+ * a mismatch. */
+static CYTHON_INLINE int __Pyx_TypeTest(PyObject *obj, PyTypeObject *type) {
+    int matches = 0;
+    if (unlikely(type == NULL)) {
+        PyErr_SetString(PyExc_SystemError, "Missing type object");
+    } else if (likely(PyObject_TypeCheck(obj, type))) {
+        matches = 1;
+    } else {
+        PyErr_Format(PyExc_TypeError, "Cannot convert %.200s to %.200s",
+                     Py_TYPE(obj)->tp_name, type->tp_name);
+    }
+    return matches;
+}
+
+/* Set an IndexError that names the axis on which a buffer access was out of
+ * bounds. */
+static void __Pyx_RaiseBufferIndexError(int axis) {
+  PyErr_Format(
+      PyExc_IndexError,
+      "Out of bounds on buffer access (axis %d)",
+      axis);
+}
+
+static CYTHON_INLINE PyObject* __Pyx_PyObject_GetSlice(
+        PyObject* obj, Py_ssize_t cstart, Py_ssize_t cstop,
+        PyObject** _py_start, PyObject** _py_stop, PyObject** _py_slice,
+        int has_cstart, int has_cstop, CYTHON_UNUSED int wraparound) { /* Slice `obj` and return the result, or NULL with an exception set.
+     * Bounds may arrive as C integers (cstart/cstop, flagged by has_cstart/has_cstop)
+     * or as Python objects (*_py_start / *_py_stop).  A non-NULL _py_slice supplies
+     * a ready-made slice object for the subscript path. */
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyMappingMethods* mp;
+#if PY_MAJOR_VERSION < 3
+    PySequenceMethods* ms = Py_TYPE(obj)->tp_as_sequence;
+    if (likely(ms && ms->sq_slice)) { /* Python 2 only: use the sq_slice slot when the type has one */
+        if (!has_cstart) {
+            if (_py_start && (*_py_start != Py_None)) {
+                cstart = __Pyx_PyIndex_AsSsize_t(*_py_start);
+                if ((cstart == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad;
+            } else
+                cstart = 0; /* missing or None start */
+        }
+        if (!has_cstop) {
+            if (_py_stop && (*_py_stop != Py_None)) {
+                cstop = __Pyx_PyIndex_AsSsize_t(*_py_stop);
+                if ((cstop == (Py_ssize_t)-1) && PyErr_Occurred()) goto bad;
+            } else
+                cstop = PY_SSIZE_T_MAX; /* missing or None stop */
+        }
+        if (wraparound && unlikely((cstart < 0) | (cstop < 0)) && likely(ms->sq_length)) { /* negative bounds: offset by the length, clamp at 0 */
+            Py_ssize_t l = ms->sq_length(obj);
+            if (likely(l >= 0)) {
+                if (cstop < 0) {
+                    cstop += l;
+                    if (cstop < 0) cstop = 0;
+                }
+                if (cstart < 0) {
+                    cstart += l;
+                    if (cstart < 0) cstart = 0;
+                }
+            } else {
+                if (PyErr_ExceptionMatches(PyExc_OverflowError)) /* only OverflowError from sq_length is tolerated */
+                    PyErr_Clear();
+                else
+                    goto bad;
+            }
+        }
+        return ms->sq_slice(obj, cstart, cstop);
+    }
+#endif
+    mp = Py_TYPE(obj)->tp_as_mapping;
+    if (likely(mp && mp->mp_subscript))
+#endif
+    { /* subscript path: index with a slice object */
+        PyObject* result;
+        PyObject *py_slice, *py_start, *py_stop;
+        if (_py_slice) {
+            py_slice = *_py_slice; /* caller-owned slice; not released below */
+        } else {
+            PyObject* owned_start = NULL; /* set only when an int object is created here */
+            PyObject* owned_stop = NULL;
+            if (_py_start) {
+                py_start = *_py_start;
+            } else {
+                if (has_cstart) {
+                    owned_start = py_start = PyInt_FromSsize_t(cstart);
+                    if (unlikely(!py_start)) goto bad;
+                } else
+                    py_start = Py_None;
+            }
+            if (_py_stop) {
+                py_stop = *_py_stop;
+            } else {
+                if (has_cstop) {
+                    owned_stop = py_stop = PyInt_FromSsize_t(cstop);
+                    if (unlikely(!py_stop)) {
+                        Py_XDECREF(owned_start);
+                        goto bad;
+                    }
+                } else
+                    py_stop = Py_None;
+            }
+            py_slice = PySlice_New(py_start, py_stop, Py_None);
+            Py_XDECREF(owned_start); /* the temporaries are dropped whether or not PySlice_New succeeded */
+            Py_XDECREF(owned_stop);
+            if (unlikely(!py_slice)) goto bad;
+        }
+#if CYTHON_COMPILING_IN_CPYTHON
+        result = mp->mp_subscript(obj, py_slice);
+#else
+        result = PyObject_GetItem(obj, py_slice);
+#endif
+        if (!_py_slice) {
+            Py_DECREF(py_slice); /* release only the slice created in this function */
+        }
+        return result;
+    }
+    PyErr_Format(PyExc_TypeError, /* reached only when the mp_subscript test above is compiled in and fails */
+        "'%.200s' object is unsliceable", Py_TYPE(obj)->tp_name);
+bad:
+    return NULL;
+}
+
+/* Set the ValueError that reports a failed buffer assignment where falling
+ * back to the previous buffer failed as well. */
+static void __Pyx_RaiseBufferFallbackError(void) {
+  PyErr_SetString(
+      PyExc_ValueError,
+      "Buffer acquisition failed on assignment; and then reacquiring the old buffer failed too!");
+}
+
+/* Install (type, value, tb) as the current exception.  On CPython the thread
+ * state fields are swapped directly and the previous contents are released
+ * afterwards; elsewhere this defers to PyErr_Restore(). */
+static CYTHON_INLINE void __Pyx_ErrRestore(PyObject *type, PyObject *value, PyObject *tb) {
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyThreadState *ts = PyThreadState_GET();
+    PyObject *old_type = ts->curexc_type;
+    PyObject *old_value = ts->curexc_value;
+    PyObject *old_tb = ts->curexc_traceback;
+
+    ts->curexc_type = type;
+    ts->curexc_value = value;
+    ts->curexc_traceback = tb;
+
+    /* Release the old triple only after the new one is in place. */
+    Py_XDECREF(old_type);
+    Py_XDECREF(old_value);
+    Py_XDECREF(old_tb);
+#else
+    PyErr_Restore(type, value, tb);
+#endif
+}
+/* Move the current exception triple into *type, *value and *tb and clear the
+ * error indicator.  On CPython the thread state fields are read and zeroed
+ * directly; elsewhere this defers to PyErr_Fetch(). */
+static CYTHON_INLINE void __Pyx_ErrFetch(PyObject **type, PyObject **value, PyObject **tb) {
+#if CYTHON_COMPILING_IN_CPYTHON
+    PyThreadState *ts = PyThreadState_GET();
+    PyObject *cur_type = ts->curexc_type;
+    PyObject *cur_value = ts->curexc_value;
+    PyObject *cur_tb = ts->curexc_traceback;
+
+    *type = cur_type;
+    *value = cur_value;
+    *tb = cur_tb;
+
+    ts->curexc_type = NULL;
+    ts->curexc_value = NULL;
+    ts->curexc_traceback = NULL;
+#else
+    PyErr_Fetch(type, value, tb);
+#endif
+}
+
+#if PY_MAJOR_VERSION < 3
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb,
+                        CYTHON_UNUSED PyObject *cause) { /* Python 2 variant: set (type, value, tb) as the current exception.
+     * `cause` is ignored.  Invalid arguments leave a TypeError set instead. */
+    Py_XINCREF(type); /* own references are taken up front; they end up in __Pyx_ErrRestore() or are dropped at raise_error */
+    if (!value || value == Py_None)
+        value = NULL;
+    else
+        Py_INCREF(value);
+    if (!tb || tb == Py_None)
+        tb = NULL;
+    else {
+        Py_INCREF(tb);
+        if (!PyTraceBack_Check(tb)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: arg 3 must be a traceback or None");
+            goto raise_error;
+        }
+    }
+    #if PY_VERSION_HEX < 0x02050000
+    if (PyClass_Check(type)) {
+    #else
+    if (PyType_Check(type)) {
+    #endif
+#if CYTHON_COMPILING_IN_PYPY
+        if (!value) {
+            Py_INCREF(Py_None);
+            value = Py_None;
+        }
+#endif
+        PyErr_NormalizeException(&type, &value, &tb);
+    } else { /* `type` is not a class, so it is treated as the exception instance */
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto raise_error;
+        }
+        value = type; /* the reference taken on `type` above now belongs to `value` */
+        #if PY_VERSION_HEX < 0x02050000
+        if (PyInstance_Check(type)) {
+            type = (PyObject*) ((PyInstanceObject*)type)->in_class;
+            Py_INCREF(type);
+        } else {
+            type = 0;
+            PyErr_SetString(PyExc_TypeError,
+                "raise: exception must be an old-style class or instance");
+            goto raise_error;
+        }
+        #else
+        type = (PyObject*) Py_TYPE(type);
+        Py_INCREF(type);
+        if (!PyType_IsSubtype((PyTypeObject *)type, (PyTypeObject *)PyExc_BaseException)) {
+            PyErr_SetString(PyExc_TypeError,
+                "raise: exception class must be a subclass of BaseException");
+            goto raise_error;
+        }
+        #endif
+    }
+    __Pyx_ErrRestore(type, value, tb); /* hands the three references over */
+    return;
+raise_error: /* a TypeError is already set; drop the references taken above */
+    Py_XDECREF(value);
+    Py_XDECREF(type);
+    Py_XDECREF(tb);
+    return;
+}
+#else /* Python 3+ */
+/* Python 3 variant: raise `type` (an exception class or instance) with an
+ * optional `value`, traceback `tb` and `cause`.
+ *
+ * - If `type` is an instance, `value` must be absent; the instance is raised.
+ * - If `type` is a class and `value` is an instance of it or of a subclass,
+ *   that instance is raised under its own class.  Otherwise the class is
+ *   called with `value` as its arguments (a tuple is passed as-is) to build
+ *   the instance.
+ * - `cause` may be an exception class (instantiated without arguments), an
+ *   exception instance, or Py_None.
+ *
+ * Nothing is returned; on invalid arguments a TypeError (or the error raised
+ * by a failing call) is left set instead of the requested exception.
+ */
+static void __Pyx_Raise(PyObject *type, PyObject *value, PyObject *tb, PyObject *cause) {
+    PyObject* owned_instance = NULL;
+    if (tb == Py_None) {
+        tb = 0;
+    } else if (tb && !PyTraceBack_Check(tb)) {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: arg 3 must be a traceback or None");
+        goto bad;
+    }
+    if (value == Py_None)
+        value = 0;
+    if (PyExceptionInstance_Check(type)) {
+        if (value) {
+            PyErr_SetString(PyExc_TypeError,
+                "instance exception may not have a separate value");
+            goto bad;
+        }
+        value = type;
+        type = (PyObject*) Py_TYPE(value);
+    } else if (PyExceptionClass_Check(type)) {
+        PyObject *instance_class = NULL;
+        if (value && PyExceptionInstance_Check(value)) {
+            instance_class = (PyObject*) Py_TYPE(value);
+            if (instance_class != type) {
+                /* PyObject_IsSubclass() is tri-state: 1, 0, or -1 with an
+                 * exception set.  The error case must not be mistaken for
+                 * "is a subclass". */
+                int is_subclass = PyObject_IsSubclass(instance_class, type);
+                if (!is_subclass) {
+                    instance_class = NULL;
+                } else if (unlikely(is_subclass == -1)) {
+                    goto bad;
+                } else {
+                    type = instance_class;
+                }
+            }
+        }
+        if (!instance_class) {
+            /* `value` is not a usable instance: build one by calling `type`. */
+            PyObject *args;
+            if (!value)
+                args = PyTuple_New(0);
+            else if (PyTuple_Check(value)) {
+                Py_INCREF(value);
+                args = value;
+            } else
+                args = PyTuple_Pack(1, value);
+            if (!args)
+                goto bad;
+            owned_instance = PyObject_Call(type, args, NULL);
+            Py_DECREF(args);
+            if (!owned_instance)
+                goto bad;
+            value = owned_instance;
+            if (!PyExceptionInstance_Check(value)) {
+                PyErr_Format(PyExc_TypeError,
+                             "calling %R should have returned an instance of "
+                             "BaseException, not %R",
+                             type, Py_TYPE(value));
+                goto bad;
+            }
+        }
+    } else {
+        PyErr_SetString(PyExc_TypeError,
+            "raise: exception class must be a subclass of BaseException");
+        goto bad;
+    }
+#if PY_VERSION_HEX >= 0x03030000
+    if (cause) {
+#else
+    if (cause && cause != Py_None) {
+#endif
+        PyObject *fixed_cause;
+        if (cause == Py_None) {
+            fixed_cause = NULL;
+        } else if (PyExceptionClass_Check(cause)) {
+            fixed_cause = PyObject_CallObject(cause, NULL);
+            if (fixed_cause == NULL)
+                goto bad;
+        } else if (PyExceptionInstance_Check(cause)) {
+            fixed_cause = cause;
+            Py_INCREF(fixed_cause);
+        } else {
+            PyErr_SetString(PyExc_TypeError,
+                            "exception causes must derive from "
+                            "BaseException");
+            goto bad;
+        }
+        PyException_SetCause(value, fixed_cause);
+    }
+    PyErr_SetObject(type, value);
+    if (tb) {
+        /* Attach the caller-supplied traceback to the exception just set. */
+        PyThreadState *tstate = PyThreadState_GET();
+        PyObject* tmp_tb = tstate->curexc_traceback;
+        if (tb != tmp_tb) {
+            Py_INCREF(tb);
+            tstate->curexc_traceback = tb;
+            Py_XDECREF(tmp_tb);
+        }
+    }
+bad:
+    /* Reached on success as well: drop the reference to an instance that was
+     * created in this function. */
+    Py_XDECREF(owned_instance);
+    return;
+}
+#endif
+
+/* Set the ValueError for an unpacking source that yields more than `expected`
+ * items. */
+static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) {
+    PyErr_Format(
+        PyExc_ValueError,
+        "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)",
+        expected);
+}
+
+/* Set the ValueError for an unpacking source that yields only `index` items.
+ * The noun in the message is pluralised unless `index` is exactly 1. */
+static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) {
+    const char *plural_suffix = (index == 1) ? "" : "s";
+    PyErr_Format(PyExc_ValueError,
+                 "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack",
+                 index, plural_suffix);
+}
+
+/* Set the TypeError reported when None is used where an iterable is needed. */
+static CYTHON_INLINE void __Pyx_RaiseNoneNotIterableError(void) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "'NoneType' object is not iterable");
+}
+
+#if PY_MAJOR_VERSION < 3
+static int __Pyx_GetBuffer(PyObject *obj, Py_buffer *view, int flags) { /* Python 2 buffer acquisition: tries, in order, the native buffer protocol
+   * (2.6+), the numpy ndarray __getbuffer__ wrapper, and (before 2.6) a getbuffer
+   * function stored in the type's tp_dict.  Returns that call's result, or -1 with
+   * a TypeError set when `obj` offers none of them. */
+  #if PY_VERSION_HEX >= 0x02060000
+    if (PyObject_CheckBuffer(obj)) return PyObject_GetBuffer(obj, view, flags);
+  #endif
+        if (PyObject_TypeCheck(obj, __pyx_ptype_5numpy_ndarray)) return __pyx_pw_5numpy_7ndarray_1__getbuffer__(obj, view, flags);
+  #if PY_VERSION_HEX < 0x02060000
+    if (obj->ob_type->tp_dict) {
+        PyObject *getbuffer_cobj = PyObject_GetItem(
+            obj->ob_type->tp_dict, __pyx_n_s_pyx_getbuffer);
+        if (getbuffer_cobj) {
+            getbufferproc func = (getbufferproc) PyCObject_AsVoidPtr(getbuffer_cobj); /* function pointer wrapped in a PyCObject */
+            Py_DECREF(getbuffer_cobj);
+            if (!func)
+                goto fail;
+            return func(obj, view, flags);
+        } else {
+            PyErr_Clear(); /* no such entry: fall through to the TypeError below */
+        }
+    }
+  #endif
+    PyErr_Format(PyExc_TypeError, "'%.200s' does not have the buffer interface", Py_TYPE(obj)->tp_name);
+#if PY_VERSION_HEX < 0x02060000
+fail:
+#endif
+    return -1;
+}
+static void __Pyx_ReleaseBuffer(Py_buffer *view) { /* Python 2 counterpart of __Pyx_GetBuffer(): release `view` through whichever
+     * mechanism its owner supports.  A view without an owner is left untouched. */
+    PyObject *obj = view->obj;
+    if (!obj) return;
+  #if PY_VERSION_HEX >= 0x02060000
+    if (PyObject_CheckBuffer(obj)) {
+        PyBuffer_Release(view);
+        return;
+    }
+  #endif
+        if (PyObject_TypeCheck(obj, __pyx_ptype_5numpy_ndarray)) { __pyx_pw_5numpy_7ndarray_3__releasebuffer__(obj, view); return; }
+  #if PY_VERSION_HEX < 0x02060000
+    if (obj->ob_type->tp_dict) {
+        PyObject *releasebuffer_cobj = PyObject_GetItem(
+            obj->ob_type->tp_dict, __pyx_n_s_pyx_releasebuffer);
+        if (releasebuffer_cobj) {
+            releasebufferproc func = (releasebufferproc) PyCObject_AsVoidPtr(releasebuffer_cobj); /* function pointer wrapped in a PyCObject */
+            Py_DECREF(releasebuffer_cobj);
+            if (!func)
+                goto fail;
+            func(obj, view);
+            return;
+        } else {
+            PyErr_Clear(); /* no such entry: nothing type-specific to call */
+        }
+    }
+  #endif
+    goto nofail;
+#if PY_VERSION_HEX < 0x02060000
+fail:
+#endif
+    PyErr_WriteUnraisable(obj); /* this function returns void, so the error can only be reported, not propagated */
+nofail:
+    Py_DECREF(obj); /* drop the owner reference held by the view */
+    view->obj = NULL;
+}
+#endif /*  PY_MAJOR_VERSION < 3 */
+
+
+        static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { /* Import module `name`; returns the module, or NULL on failure.
+     * `from_list` may be NULL, in which case an empty list is passed on.  On Python 3
+     * a `level` of -1 means: try a level-1 relative import first (only when
+     * __Pyx_MODULE_NAME contains a dot), then fall back to an absolute import. */
+    PyObject *empty_list = 0;
+    PyObject *module = 0;
+    PyObject *global_dict = 0;
+    PyObject *empty_dict = 0;
+    PyObject *list;
+    #if PY_VERSION_HEX < 0x03030000
+    PyObject *py_import;
+    py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); /* before 3.3 the import callable is fetched from __pyx_b and called */
+    if (!py_import)
+        goto bad;
+    #endif
+    if (from_list)
+        list = from_list;
+    else {
+        empty_list = PyList_New(0);
+        if (!empty_list)
+            goto bad;
+        list = empty_list;
+    }
+    global_dict = PyModule_GetDict(__pyx_m); /* never released below, unlike empty_list/empty_dict */
+    if (!global_dict)
+        goto bad;
+    empty_dict = PyDict_New();
+    if (!empty_dict)
+        goto bad;
+    #if PY_VERSION_HEX >= 0x02050000
+    {
+        #if PY_MAJOR_VERSION >= 3
+        if (level == -1) {
+            if (strchr(__Pyx_MODULE_NAME, '.')) {
+                #if PY_VERSION_HEX < 0x03030000
+                PyObject *py_level = PyInt_FromLong(1);
+                if (!py_level)
+                    goto bad;
+                module = PyObject_CallFunctionObjArgs(py_import,
+                    name, global_dict, empty_dict, list, py_level, NULL);
+                Py_DECREF(py_level);
+                #else
+                module = PyImport_ImportModuleLevelObject(
+                    name, global_dict, empty_dict, list, 1);
+                #endif
+                if (!module) {
+                    if (!PyErr_ExceptionMatches(PyExc_ImportError)) /* only ImportError triggers the absolute retry */
+                        goto bad;
+                    PyErr_Clear();
+                }
+            }
+            level = 0; /* try absolute import on failure */
+        }
+        #endif
+        if (!module) {
+            #if PY_VERSION_HEX < 0x03030000
+            PyObject *py_level = PyInt_FromLong(level);
+            if (!py_level)
+                goto bad;
+            module = PyObject_CallFunctionObjArgs(py_import,
+                name, global_dict, empty_dict, list, py_level, NULL);
+            Py_DECREF(py_level);
+            #else
+            module = PyImport_ImportModuleLevelObject(
+                name, global_dict, empty_dict, list, level);
+            #endif
+        }
+    }
+    #else
+    if (level>0) {
+        PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4.");
+        goto bad;
+    }
+    module = PyObject_CallFunctionObjArgs(py_import,
+        name, global_dict, empty_dict, list, NULL);
+    #endif
+bad: /* shared exit for success and failure: release temporaries and return `module` (NULL on failure) */
+    #if PY_VERSION_HEX < 0x03030000
+    Py_XDECREF(py_import);
+    #endif
+    Py_XDECREF(empty_list);
+    Py_XDECREF(empty_dict);
+    return module;
+}
+
+#define __PYX_VERIFY_RETURN_INT(target_type, func_type, func)             \
+    {                                                                     \
+        func_type value = func(x);                                        \
+        if (sizeof(target_type) < sizeof(func_type)) {                    \
+            if (unlikely(value != (func_type) (target_type) value)) {     \
+                func_type zero = 0;                                       \
+                PyErr_SetString(PyExc_OverflowError,                      \
+                    (is_unsigned && unlikely(value < zero)) ?             \
+                    "can't convert negative value to " #target_type :     \
+                    "value too large to convert to " #target_type);       \
+                return (target_type) -1;                                  \
+            }                                                             \
+        }                                                                 \
+        return (target_type) value;                                       \
+    } /* Expands to a block that always returns from the enclosing function: func(x) cast to target_type, or (target_type)-1 with OverflowError when the value does not survive the narrowing round-trip. Needs `x` and `is_unsigned` in scope at the expansion site. */
+
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+ #endif
+#endif
+static CYTHON_INLINE npy_int32 __Pyx_PyInt_As_npy_int32(PyObject *x) { /* Convert Python object `x` to npy_int32.
+     * Returns (npy_int32)-1 with an exception set on failure; the value -1 itself is
+     * also a valid result.  Objects that are neither int nor long are first converted
+     * with __Pyx_PyNumber_Int() and then converted again through this function. */
+    const npy_int32 neg_one = (npy_int32) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero; /* true only when npy_int32 is an unsigned type */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {
+        if (sizeof(npy_int32) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(npy_int32, long, PyInt_AS_LONG)
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to npy_int32");
+                return (npy_int32) -1;
+            }
+            return (npy_int32) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(npy_int32)) { /* fast path: read zero- and one-digit values straight from the PyLong */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return (npy_int32) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (unlikely(Py_SIZE(x) < 0)) { /* a negative Py_SIZE is taken to mean a negative value */
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to npy_int32");
+                return (npy_int32) -1;
+            }
+            if (sizeof(npy_int32) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT(npy_int32, unsigned long, PyLong_AsUnsignedLong)
+            } else if (sizeof(npy_int32) <= sizeof(unsigned long long)) {
+                __PYX_VERIFY_RETURN_INT(npy_int32, unsigned long long, PyLong_AsUnsignedLongLong)
+            }
+        } else {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(npy_int32)) { /* same fast path, with a sign for the single-digit case */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return +(npy_int32) ((PyLongObject*)x)->ob_digit[0];
+                    case -1: return -(npy_int32) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (sizeof(npy_int32) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT(npy_int32, long, PyLong_AsLong)
+            } else if (sizeof(npy_int32) <= sizeof(long long)) {
+                __PYX_VERIFY_RETURN_INT(npy_int32, long long, PyLong_AsLongLong)
+            }
+        }
+        { /* reached only when none of the sizeof branches above returned: convert through a raw byte array */
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            npy_int32 val;
+            PyObject *v = __Pyx_PyNumber_Int(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) {
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one; /* runtime byte-order probe */
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned);
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (npy_int32) -1;
+        }
+    } else {
+        npy_int32 val;
+        PyObject *tmp = __Pyx_PyNumber_Int(x);
+        if (!tmp) return (npy_int32) -1;
+        val = __Pyx_PyInt_As_npy_int32(tmp); /* convert again with the object __Pyx_PyNumber_Int() produced */
+        Py_DECREF(tmp);
+        return val;
+    }
+}
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_int(int value) { /* Build a Python integer object from a C int.
+     * The constructor is chosen by comparing sizeof(int) with the long / long long
+     * sizes; the result is whatever that constructor returns. */
+    const int neg_one = (int) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero; /* true only for an unsigned type; kept generic although the type here is int */
+    if (is_unsigned) {
+        if (sizeof(int) < sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(int) <= sizeof(unsigned long)) {
+            return PyLong_FromUnsignedLong((unsigned long) value);
+        } else if (sizeof(int) <= sizeof(unsigned long long)) {
+            return PyLong_FromUnsignedLongLong((unsigned long long) value);
+        }
+    } else {
+        if (sizeof(int) <= sizeof(long)) {
+            return PyInt_FromLong((long) value);
+        } else if (sizeof(int) <= sizeof(long long)) {
+            return PyLong_FromLongLong((long long) value);
+        }
+    }
+    { /* fallback when no branch above returned: build the object from the raw bytes of `value` */
+        int one = 1; int little = (int)*(unsigned char *)&one; /* runtime byte-order probe */
+        unsigned char *bytes = (unsigned char *)&value;
+        return _PyLong_FromByteArray(bytes, sizeof(int),
+                                     little, !is_unsigned);
+    }
+}
+
+#if CYTHON_CCOMPLEX  /* build a __pyx_t_float_complex from real part x and imaginary part y */
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      return ::std::complex< float >(x, y);  /* C++ build: construct std::complex<float> directly */
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      return x + y*(__pyx_t_float_complex)_Complex_I;  /* C build: x + y*i via the _Complex_I constant */
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_float_complex __pyx_t_float_complex_from_parts(float x, float y) {
+      __pyx_t_float_complex z;  /* CYTHON_CCOMPLEX off: the type is used as a struct with real/imag fields */
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+#if CYTHON_CCOMPLEX
+#else  /* struct-based float complex arithmetic, compiled only when CYTHON_CCOMPLEX is off */
+    static CYTHON_INLINE int __Pyx_c_eqf(__pyx_t_float_complex a, __pyx_t_float_complex b) {  /* exact component-wise equality */
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_sumf(__pyx_t_float_complex a, __pyx_t_float_complex b) {  /* a + b */
+        __pyx_t_float_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_difff(__pyx_t_float_complex a, __pyx_t_float_complex b) {  /* a - b */
+        __pyx_t_float_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_prodf(__pyx_t_float_complex a, __pyx_t_float_complex b) {  /* a * b */
+        __pyx_t_float_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_quotf(__pyx_t_float_complex a, __pyx_t_float_complex b) {  /* a / b; b == 0 is not guarded */
+        __pyx_t_float_complex z;
+        float denom = b.real * b.real + b.imag * b.imag;  /* |b|^2 */
+        z.real = (a.real * b.real + a.imag * b.imag) / denom;
+        z.imag = (a.imag * b.real - a.real * b.imag) / denom;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_negf(__pyx_t_float_complex a) {  /* -a */
+        __pyx_t_float_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zerof(__pyx_t_float_complex a) {  /* true when both parts compare equal to 0 */
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_conjf(__pyx_t_float_complex a) {  /* complex conjugate */
+        __pyx_t_float_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE float __Pyx_c_absf(__pyx_t_float_complex z) {  /* modulus |z| */
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER)
+            return sqrtf(z.real*z.real + z.imag*z.imag);
+          #else
+            return hypotf(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_float_complex __Pyx_c_powf(__pyx_t_float_complex a, __pyx_t_float_complex b) {  /* a ** b */
+            __pyx_t_float_complex z;
+            float r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) {  /* integral real exponent: 0..4 are done by direct multiplication */
+                if (b.real < 0) {  /* negative exponent: replace a by 1/a and negate the exponent */
+                    float denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) {
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        /* a single multiplication suffices; no temporary is needed */
+                        return __Pyx_c_prodf(a, a);
+                    case 3:
+                        z = __Pyx_c_prodf(a, a);
+                        return __Pyx_c_prodf(z, a);
+                    case 4:
+                        z = __Pyx_c_prodf(a, a);
+                        return __Pyx_c_prodf(z, z);
+                }
+            }
+            if (a.imag == 0) {  /* purely real base: polar form is known without sqrt/atan2 of both parts */
+                if (a.real == 0) {
+                    return a;  /* zero base is returned unchanged */
+                }
+                r = (a.real < 0) ? -a.real : a.real;  /* modulus is |a.real|; a negative r would make logf(r) NaN */
+                theta = (a.real < 0) ? atan2f(0.0f, -1.0f) : 0;  /* argument is pi for a negative real, 0 otherwise */
+            } else {
+                r = __Pyx_c_absf(a);
+                theta = atan2f(a.imag, a.real);
+            }
+            lnr = logf(r);
+            z_r = expf(lnr * b.real - theta * b.imag);  /* modulus of the result */
+            z_theta = theta * b.real + lnr * b.imag;  /* argument of the result */
+            z.real = z_r * cosf(z_theta);
+            z.imag = z_r * sinf(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+#if CYTHON_CCOMPLEX  /* build a __pyx_t_double_complex from real part x and imaginary part y */
+  #ifdef __cplusplus
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      return ::std::complex< double >(x, y);  /* C++ build: construct std::complex<double> directly */
+    }
+  #else
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      return x + y*(__pyx_t_double_complex)_Complex_I;  /* C build: x + y*i via the _Complex_I constant */
+    }
+  #endif
+#else
+    static CYTHON_INLINE __pyx_t_double_complex __pyx_t_double_complex_from_parts(double x, double y) {
+      __pyx_t_double_complex z;  /* CYTHON_CCOMPLEX off: the type is used as a struct with real/imag fields */
+      z.real = x;
+      z.imag = y;
+      return z;
+    }
+#endif
+
+#if CYTHON_CCOMPLEX
+#else  /* struct-based double complex arithmetic, compiled only when CYTHON_CCOMPLEX is off */
+    static CYTHON_INLINE int __Pyx_c_eq(__pyx_t_double_complex a, __pyx_t_double_complex b) {  /* exact component-wise equality */
+       return (a.real == b.real) && (a.imag == b.imag);
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_sum(__pyx_t_double_complex a, __pyx_t_double_complex b) {  /* a + b */
+        __pyx_t_double_complex z;
+        z.real = a.real + b.real;
+        z.imag = a.imag + b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_diff(__pyx_t_double_complex a, __pyx_t_double_complex b) {  /* a - b */
+        __pyx_t_double_complex z;
+        z.real = a.real - b.real;
+        z.imag = a.imag - b.imag;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_prod(__pyx_t_double_complex a, __pyx_t_double_complex b) {  /* a * b */
+        __pyx_t_double_complex z;
+        z.real = a.real * b.real - a.imag * b.imag;
+        z.imag = a.real * b.imag + a.imag * b.real;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_quot(__pyx_t_double_complex a, __pyx_t_double_complex b) {  /* a / b; b == 0 is not guarded */
+        __pyx_t_double_complex z;
+        double denom = b.real * b.real + b.imag * b.imag;  /* |b|^2 */
+        z.real = (a.real * b.real + a.imag * b.imag) / denom;
+        z.imag = (a.imag * b.real - a.real * b.imag) / denom;
+        return z;
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_neg(__pyx_t_double_complex a) {  /* -a */
+        __pyx_t_double_complex z;
+        z.real = -a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    static CYTHON_INLINE int __Pyx_c_is_zero(__pyx_t_double_complex a) {  /* true when both parts compare equal to 0 */
+       return (a.real == 0) && (a.imag == 0);
+    }
+    static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_conj(__pyx_t_double_complex a) {  /* complex conjugate */
+        __pyx_t_double_complex z;
+        z.real =  a.real;
+        z.imag = -a.imag;
+        return z;
+    }
+    #if 1
+        static CYTHON_INLINE double __Pyx_c_abs(__pyx_t_double_complex z) {  /* modulus |z| */
+          #if !defined(HAVE_HYPOT) || defined(_MSC_VER)
+            return sqrt(z.real*z.real + z.imag*z.imag);
+          #else
+            return hypot(z.real, z.imag);
+          #endif
+        }
+        static CYTHON_INLINE __pyx_t_double_complex __Pyx_c_pow(__pyx_t_double_complex a, __pyx_t_double_complex b) {  /* a ** b */
+            __pyx_t_double_complex z;
+            double r, lnr, theta, z_r, z_theta;
+            if (b.imag == 0 && b.real == (int)b.real) {  /* integral real exponent: 0..4 are done by direct multiplication */
+                if (b.real < 0) {  /* negative exponent: replace a by 1/a and negate the exponent */
+                    double denom = a.real * a.real + a.imag * a.imag;
+                    a.real = a.real / denom;
+                    a.imag = -a.imag / denom;
+                    b.real = -b.real;
+                }
+                switch ((int)b.real) {
+                    case 0:
+                        z.real = 1;
+                        z.imag = 0;
+                        return z;
+                    case 1:
+                        return a;
+                    case 2:
+                        /* a single multiplication suffices; no temporary is needed */
+                        return __Pyx_c_prod(a, a);
+                    case 3:
+                        z = __Pyx_c_prod(a, a);
+                        return __Pyx_c_prod(z, a);
+                    case 4:
+                        z = __Pyx_c_prod(a, a);
+                        return __Pyx_c_prod(z, z);
+                }
+            }
+            if (a.imag == 0) {  /* purely real base: polar form is known without sqrt/atan2 of both parts */
+                if (a.real == 0) {
+                    return a;  /* zero base is returned unchanged */
+                }
+                r = (a.real < 0) ? -a.real : a.real;  /* modulus is |a.real|; a negative r would make log(r) NaN */
+                theta = (a.real < 0) ? atan2(0.0, -1.0) : 0;  /* argument is pi for a negative real, 0 otherwise */
+            } else {
+                r = __Pyx_c_abs(a);
+                theta = atan2(a.imag, a.real);
+            }
+            lnr = log(r);
+            z_r = exp(lnr * b.real - theta * b.imag);  /* modulus of the result */
+            z_theta = theta * b.real + lnr * b.imag;  /* argument of the result */
+            z.real = z_r * cos(z_theta);
+            z.imag = z_r * sin(z_theta);
+            return z;
+        }
+    #endif
+#endif
+
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+ #endif
+#endif
+static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *x) {  /* convert a Python object to a C int; failure paths return (int) -1 */
+    const int neg_one = (int) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero;  /* signedness probe of the target type */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {  /* Python 2 only: int objects are read through PyInt_AS_LONG */
+        if (sizeof(int) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(int, long, PyInt_AS_LONG)  /* macro defined elsewhere; presumably range-checks before returning -- confirm */
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to int");
+                return (int) -1;
+            }
+            return (int) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(int)) {  /* fast path: read zero- and one-digit values straight from ob_digit */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return (int) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (unlikely(Py_SIZE(x) < 0)) {  /* negative Py_SIZE is rejected for an unsigned target */
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to int");
+                return (int) -1;
+            }
+            if (sizeof(int) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT(int, unsigned long, PyLong_AsUnsignedLong)
+            } else if (sizeof(int) <= sizeof(unsigned long long)) {
+                __PYX_VERIFY_RETURN_INT(int, unsigned long long, PyLong_AsUnsignedLongLong)
+            }
+        } else {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(int)) {  /* fast path: Py_SIZE of -1/0/1 selects the sign of a single digit */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return +(int) ((PyLongObject*)x)->ob_digit[0];
+                    case -1: return -(int) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (sizeof(int) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT(int, long, PyLong_AsLong)
+            } else if (sizeof(int) <= sizeof(long long)) {
+                __PYX_VERIFY_RETURN_INT(int, long long, PyLong_AsLongLong)
+            }
+        }
+        {  /* reached only if no sized conversion above applied: fall back to a byte-array conversion */
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            int val;
+            PyObject *v = __Pyx_PyNumber_Int(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) {  /* Python 2: promote an int result to long before the cast below */
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one;  /* runtime byte-order probe */
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned);
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (int) -1;
+        }
+    } else {  /* not an integer object: coerce with __Pyx_PyNumber_Int, then convert the result recursively */
+        int val;
+        PyObject *tmp = __Pyx_PyNumber_Int(x);
+        if (!tmp) return (int) -1;
+        val = __Pyx_PyInt_As_int(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+}
+
+static CYTHON_INLINE PyObject* __Pyx_PyInt_From_long(long value) {
+    /* Box a C long as a Python integer via the first C-API constructor wide enough to hold it. */
+    const long minus_one = (long) -1, zero = 0;
+    const int is_unsigned = minus_one > zero;  /* signedness probe of the source type */
+    if (!is_unsigned) {
+        if (sizeof(long) <= sizeof(long))
+            return PyInt_FromLong((long) value);
+        if (sizeof(long) <= sizeof(long long))
+            return PyLong_FromLongLong((long long) value);
+    } else {
+        if (sizeof(long) < sizeof(long))
+            return PyInt_FromLong((long) value);
+        if (sizeof(long) <= sizeof(unsigned long))
+            return PyLong_FromUnsignedLong((unsigned long) value);
+        if (sizeof(long) <= sizeof(unsigned long long))
+            return PyLong_FromUnsignedLongLong((unsigned long long) value);
+    }
+    {
+        /* No sized constructor matched: pass the raw bytes in native byte order. */
+        int probe = 1;
+        int little_endian = (int)*(unsigned char *)&probe;
+        return _PyLong_FromByteArray((unsigned char *)&value, sizeof(long),
+                                     little_endian, !is_unsigned);
+    }
+}
+
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+ #endif
+#endif
+static CYTHON_INLINE long __Pyx_PyInt_As_long(PyObject *x) {  /* convert a Python object to a C long; failure paths return (long) -1 */
+    const long neg_one = (long) -1, const_zero = 0;
+    const int is_unsigned = neg_one > const_zero;  /* signedness probe of the target type */
+#if PY_MAJOR_VERSION < 3
+    if (likely(PyInt_Check(x))) {  /* Python 2 only: int objects are read through PyInt_AS_LONG */
+        if (sizeof(long) < sizeof(long)) {
+            __PYX_VERIFY_RETURN_INT(long, long, PyInt_AS_LONG)  /* macro defined elsewhere; presumably range-checks before returning -- confirm */
+        } else {
+            long val = PyInt_AS_LONG(x);
+            if (is_unsigned && unlikely(val < 0)) {
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to long");
+                return (long) -1;
+            }
+            return (long) val;
+        }
+    } else
+#endif
+    if (likely(PyLong_Check(x))) {
+        if (is_unsigned) {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(long)) {  /* fast path: read zero- and one-digit values straight from ob_digit */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return (long) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (unlikely(Py_SIZE(x) < 0)) {  /* negative Py_SIZE is rejected for an unsigned target */
+                PyErr_SetString(PyExc_OverflowError,
+                                "can't convert negative value to long");
+                return (long) -1;
+            }
+            if (sizeof(long) <= sizeof(unsigned long)) {
+                __PYX_VERIFY_RETURN_INT(long, unsigned long, PyLong_AsUnsignedLong)
+            } else if (sizeof(long) <= sizeof(unsigned long long)) {
+                __PYX_VERIFY_RETURN_INT(long, unsigned long long, PyLong_AsUnsignedLongLong)
+            }
+        } else {
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+            if (sizeof(digit) <= sizeof(long)) {  /* fast path: Py_SIZE of -1/0/1 selects the sign of a single digit */
+                switch (Py_SIZE(x)) {
+                    case  0: return 0;
+                    case  1: return +(long) ((PyLongObject*)x)->ob_digit[0];
+                    case -1: return -(long) ((PyLongObject*)x)->ob_digit[0];
+                }
+            }
+ #endif
+#endif
+            if (sizeof(long) <= sizeof(long)) {
+                __PYX_VERIFY_RETURN_INT(long, long, PyLong_AsLong)
+            } else if (sizeof(long) <= sizeof(long long)) {
+                __PYX_VERIFY_RETURN_INT(long, long long, PyLong_AsLongLong)
+            }
+        }
+        {  /* reached only if no sized conversion above applied: fall back to a byte-array conversion */
+#if CYTHON_COMPILING_IN_PYPY && !defined(_PyLong_AsByteArray)
+            PyErr_SetString(PyExc_RuntimeError,
+                            "_PyLong_AsByteArray() not available in PyPy, cannot convert large numbers");
+#else
+            long val;
+            PyObject *v = __Pyx_PyNumber_Int(x);
+ #if PY_MAJOR_VERSION < 3
+            if (likely(v) && !PyLong_Check(v)) {  /* Python 2: promote an int result to long before the cast below */
+                PyObject *tmp = v;
+                v = PyNumber_Long(tmp);
+                Py_DECREF(tmp);
+            }
+ #endif
+            if (likely(v)) {
+                int one = 1; int is_little = (int)*(unsigned char *)&one;  /* runtime byte-order probe */
+                unsigned char *bytes = (unsigned char *)&val;
+                int ret = _PyLong_AsByteArray((PyLongObject *)v,
+                                              bytes, sizeof(val),
+                                              is_little, !is_unsigned);
+                Py_DECREF(v);
+                if (likely(!ret))
+                    return val;
+            }
+#endif
+            return (long) -1;
+        }
+    } else {  /* not an integer object: coerce with __Pyx_PyNumber_Int, then convert the result recursively */
+        long val;
+        PyObject *tmp = __Pyx_PyNumber_Int(x);
+        if (!tmp) return (long) -1;
+        val = __Pyx_PyInt_As_long(tmp);
+        Py_DECREF(tmp);
+        return val;
+    }
+}
+
+static int __Pyx_check_binary_version(void) {  /* warn when the compile-time and runtime Python versions differ; returns 0 when they match */
+    char ctversion[4], rtversion[4];  /* 3 chars + NUL: only "X.Y" fits, so a two-digit minor version is truncated */
+    PyOS_snprintf(ctversion, 4, "%d.%d", PY_MAJOR_VERSION, PY_MINOR_VERSION);
+    PyOS_snprintf(rtversion, 4, "%s", Py_GetVersion());
+    if (ctversion[0] != rtversion[0] || ctversion[2] != rtversion[2]) {  /* compares the major digit and the first minor digit only */
+        char message[200];
+        PyOS_snprintf(message, sizeof(message),
+                      "compiletime version %s of module '%.100s' "
+                      "does not match runtime version %s",
+                      ctversion, __Pyx_MODULE_NAME, rtversion);
+        #if PY_VERSION_HEX < 0x02050000
+        return PyErr_Warn(NULL, message);
+        #else
+        return PyErr_WarnEx(NULL, message, 1);  /* the warning call's status is passed back to the caller */
+        #endif
+    }
+    return 0;
+}
+
+#ifndef __PYX_HAVE_RT_ImportModule
+#define __PYX_HAVE_RT_ImportModule
+/* Import the module called `name` and return it; yields 0 (NULL) when either */
+/* the name object cannot be created or the import itself fails. */
+static PyObject *__Pyx_ImportModule(const char *name) {
+    PyObject *module = 0;
+    PyObject *name_obj = __Pyx_PyIdentifier_FromString(name);
+    if (name_obj) {
+        /* The import result, NULL included, is handed back to the caller as is. */
+        module = PyImport_Import(name_obj);
+        /* The temporary name object is released once the import was attempted. */
+        Py_DECREF(name_obj);
+    }
+    return module;
+}
+#endif
+
+#ifndef __PYX_HAVE_RT_ImportType
+#define __PYX_HAVE_RT_ImportType
+static PyTypeObject *__Pyx_ImportType(const char *module_name, const char *class_name,  /* fetch module_name.class_name and check its basicsize against `size` */
+    size_t size, int strict)  /* strict == 0: a larger runtime size only warns; any other mismatch raises ValueError */
+{
+    PyObject *py_module = 0;
+    PyObject *result = 0;
+    PyObject *py_name = 0;
+    char warning[200];
+    Py_ssize_t basicsize;
+#ifdef Py_LIMITED_API
+    PyObject *py_basicsize;
+#endif
+    py_module = __Pyx_ImportModule(module_name);
+    if (!py_module)
+        goto bad;
+    py_name = __Pyx_PyIdentifier_FromString(class_name);
+    if (!py_name)
+        goto bad;
+    result = PyObject_GetAttr(py_module, py_name);
+    Py_DECREF(py_name);
+    py_name = 0;
+    Py_DECREF(py_module);
+    py_module = 0;  /* cleared so the shared cleanup at `bad` does not release it again */
+    if (!result)
+        goto bad;
+    if (!PyType_Check(result)) {
+        PyErr_Format(PyExc_TypeError,
+            "%.200s.%.200s is not a type object",
+            module_name, class_name);
+        goto bad;
+    }
+#ifndef Py_LIMITED_API
+    basicsize = ((PyTypeObject *)result)->tp_basicsize;  /* direct struct access outside the limited API */
+#else
+    py_basicsize = PyObject_GetAttrString(result, "__basicsize__");  /* limited API: read the size through the attribute instead */
+    if (!py_basicsize)
+        goto bad;
+    basicsize = PyLong_AsSsize_t(py_basicsize);
+    Py_DECREF(py_basicsize);
+    py_basicsize = 0;
+    if (basicsize == (Py_ssize_t)-1 && PyErr_Occurred())
+        goto bad;
+#endif
+    if (!strict && (size_t)basicsize > size) {  /* runtime type is larger than expected: warn only */
+        PyOS_snprintf(warning, sizeof(warning),
+            "%s.%s size changed, may indicate binary incompatibility",
+            module_name, class_name);
+        #if PY_VERSION_HEX < 0x02050000
+        if (PyErr_Warn(NULL, warning) < 0) goto bad;
+        #else
+        if (PyErr_WarnEx(NULL, warning, 0) < 0) goto bad;
+        #endif
+    }
+    else if ((size_t)basicsize != size) {  /* any remaining size difference is a hard error */
+        PyErr_Format(PyExc_ValueError,
+            "%.200s.%.200s has the wrong size, try recompiling",
+            module_name, class_name);
+        goto bad;
+    }
+    return (PyTypeObject *)result;  /* success: the reference obtained from PyObject_GetAttr is returned */
+bad:
+    Py_XDECREF(py_module);
+    Py_XDECREF(result);
+    return NULL;
+}
+#endif
+
+static int __pyx_bisect_code_objects(__Pyx_CodeObjectCacheEntry* entries, int count, int code_line) {
+    /* Binary search by code_line: returns a matching index, else the insertion slot. */
+    int lo = 0, hi = count - 1, probe = 0;
+    if (hi >= 0 && entries[hi].code_line < code_line) {
+        return count;  /* larger than every cached line */
+    }
+    while (lo < hi) {
+        int seen;
+        probe = (lo + hi) / 2;
+        seen = entries[probe].code_line;
+        if (seen == code_line) {
+            return probe;
+        }
+        if (code_line < seen) {
+            hi = probe;
+        } else {
+            lo = probe + 1;
+        }
+    }
+    return (entries[probe].code_line < code_line) ? probe + 1 : probe;
+}
+static PyCodeObject *__pyx_find_code_object(int code_line) {
+    /* Look up the cached code object for code_line; returns a new reference, or NULL if absent. */
+    __Pyx_CodeObjectCacheEntry *slot;
+    int idx;
+    if (unlikely(!code_line) || unlikely(!__pyx_code_cache.entries)) {
+        return NULL;
+    }
+    idx = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if (unlikely(idx >= __pyx_code_cache.count)) return NULL;
+    slot = &__pyx_code_cache.entries[idx];
+    if (unlikely(slot->code_line != code_line)) return NULL;
+    Py_INCREF(slot->code_object);
+    return slot->code_object;
+}
+static void __pyx_insert_code_object(int code_line, PyCodeObject* code_object) {  /* store code_object in the cache under code_line, keeping entries ordered by code_line */
+    int pos, i;
+    __Pyx_CodeObjectCacheEntry* entries = __pyx_code_cache.entries;
+    if (unlikely(!code_line)) {  /* line 0 is never cached */
+        return;
+    }
+    if (unlikely(!entries)) {  /* first insertion: allocate room for 64 entries */
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Malloc(64*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (likely(entries)) {
+            __pyx_code_cache.entries = entries;
+            __pyx_code_cache.max_count = 64;
+            __pyx_code_cache.count = 1;
+            entries[0].code_line = code_line;
+            entries[0].code_object = code_object;
+            Py_INCREF(code_object);  /* the cache holds its own reference */
+        }
+        return;  /* allocation failure is silent: the object is simply not cached */
+    }
+    pos = __pyx_bisect_code_objects(__pyx_code_cache.entries, __pyx_code_cache.count, code_line);
+    if ((pos < __pyx_code_cache.count) && unlikely(__pyx_code_cache.entries[pos].code_line == code_line)) {  /* line already cached: replace in place */
+        PyCodeObject* tmp = entries[pos].code_object;
+        entries[pos].code_object = code_object;  /* NOTE(review): stored without Py_INCREF, unlike the other insert paths -- confirm intended ownership */
+        Py_DECREF(tmp);
+        return;
+    }
+    if (__pyx_code_cache.count == __pyx_code_cache.max_count) {  /* full: grow capacity by 64 entries */
+        int new_max = __pyx_code_cache.max_count + 64;
+        entries = (__Pyx_CodeObjectCacheEntry*)PyMem_Realloc(
+            __pyx_code_cache.entries, new_max*sizeof(__Pyx_CodeObjectCacheEntry));
+        if (unlikely(!entries)) {
+            return;  /* growth failed: existing cache is left untouched and the object is not cached */
+        }
+        __pyx_code_cache.entries = entries;
+        __pyx_code_cache.max_count = new_max;
+    }
+    for (i=__pyx_code_cache.count; i>pos; i--) {  /* shift the tail up by one to open slot `pos` */
+        entries[i] = entries[i-1];
+    }
+    entries[pos].code_line = code_line;
+    entries[pos].code_object = code_object;
+    __pyx_code_cache.count++;
+    Py_INCREF(code_object);  /* the cache holds its own reference */
+}
+
+#include "compile.h"
+#include "frameobject.h"
+#include "traceback.h"
+static PyCodeObject* __Pyx_CreateCodeObjectForTraceback(  /* build an empty code object that carries only a name, file name and first line */
+            const char *funcname, int c_line,
+            int py_line, const char *filename) {
+    PyCodeObject *py_code = 0;
+    PyObject *py_srcfile = 0;
+    PyObject *py_funcname = 0;
+    #if PY_MAJOR_VERSION < 3
+    py_srcfile = PyString_FromString(filename);
+    #else
+    py_srcfile = PyUnicode_FromString(filename);
+    #endif
+    if (!py_srcfile) goto bad;
+    if (c_line) {  /* a non-zero C line is appended to the function name as "name (cfile:line)" */
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        #else
+        py_funcname = PyUnicode_FromFormat( "%s (%s:%d)", funcname, __pyx_cfilenm, c_line);
+        #endif
+    }
+    else {
+        #if PY_MAJOR_VERSION < 3
+        py_funcname = PyString_FromString(funcname);
+        #else
+        py_funcname = PyUnicode_FromString(funcname);
+        #endif
+    }
+    if (!py_funcname) goto bad;
+    py_code = __Pyx_PyCode_New(
+        0,            /*int argcount,*/
+        0,            /*int kwonlyargcount,*/
+        0,            /*int nlocals,*/
+        0,            /*int stacksize,*/
+        0,            /*int flags,*/
+        __pyx_empty_bytes, /*PyObject *code,*/
+        __pyx_empty_tuple, /*PyObject *consts,*/
+        __pyx_empty_tuple, /*PyObject *names,*/
+        __pyx_empty_tuple, /*PyObject *varnames,*/
+        __pyx_empty_tuple, /*PyObject *freevars,*/
+        __pyx_empty_tuple, /*PyObject *cellvars,*/
+        py_srcfile,   /*PyObject *filename,*/
+        py_funcname,  /*PyObject *name,*/
+        py_line,      /*int firstlineno,*/
+        __pyx_empty_bytes  /*PyObject *lnotab*/
+    );
+    Py_DECREF(py_srcfile);  /* local string references are dropped whether or not creation succeeded */
+    Py_DECREF(py_funcname);
+    return py_code;  /* may be NULL if __Pyx_PyCode_New failed */
+bad:
+    Py_XDECREF(py_srcfile);
+    Py_XDECREF(py_funcname);
+    return NULL;
+}
+static void __Pyx_AddTraceback(const char *funcname, int c_line,
+                               int py_line, const char *filename) {
+    /* Record a frame for funcname/filename at py_line via PyTraceBack_Here. */
+    /* Code objects are cached under c_line when it is non-zero, otherwise under py_line. */
+    const int cache_key = c_line ? c_line : py_line;
+    PyCodeObject *code;
+    PyObject *module_dict;
+    PyFrameObject *frame = 0;
+    code = __pyx_find_code_object(cache_key);
+    if (!code) {
+        code = __Pyx_CreateCodeObjectForTraceback(
+            funcname, c_line, py_line, filename);
+        if (!code) goto done;
+        __pyx_insert_code_object(cache_key, code);
+    }
+    module_dict = PyModule_GetDict(__pyx_m);
+    if (!module_dict) goto done;
+    frame = PyFrame_New(PyThreadState_GET(), code, module_dict, 0);
+    if (frame) {
+        /* The frame reports py_line as its current line. */
+        frame->f_lineno = py_line;
+        PyTraceBack_Here(frame);
+    }
+done:
+    Py_XDECREF(code);
+    Py_XDECREF(frame);
+}
+
+static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {  /* create the Python string object for every table entry; returns 0 on success, -1 on the first failure */
+    while (t->p) {  /* the table ends at the first entry whose target pointer is NULL */
+        #if PY_MAJOR_VERSION < 3
+        if (t->is_unicode) {
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);  /* t->n - 1: the stored length presumably counts the trailing NUL -- confirm */
+        } else if (t->intern) {
+            *t->p = PyString_InternFromString(t->s);
+        } else {
+            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        }
+        #else  /* Python 3+ has unicode identifiers */
+        if (t->is_unicode | t->is_str) {  /* both flags produce a unicode object on Python 3 */
+            if (t->intern) {
+                *t->p = PyUnicode_InternFromString(t->s);
+            } else if (t->encoding) {
+                *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
+            } else {
+                *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
+            }
+        } else {
+            *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);
+        }
+        #endif
+        if (!*t->p)  /* object creation failed: stop; earlier entries stay initialised */
+            return -1;
+        ++t;
+    }
+    return 0;
+}
+
+static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char* c_str) {  /* NUL-terminated C string -> unicode object via the sized variant */
+    return __Pyx_PyUnicode_FromStringAndSize(c_str, strlen(c_str));  /* length is measured with strlen, so c_str must not be NULL */
+}
+static CYTHON_INLINE char* __Pyx_PyObject_AsString(PyObject* o) {
+    Py_ssize_t unused_length;  /* required by the sized variant; the value is discarded */
+    return __Pyx_PyObject_AsStringAndSize(o, &unused_length);
+}
+static CYTHON_INLINE char* __Pyx_PyObject_AsStringAndSize(PyObject* o, Py_ssize_t *length) {  /* return a char* view of o and store its byte length in *length; NULL on failure */
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT
+    if (
+#if PY_MAJOR_VERSION < 3 && __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+            __Pyx_sys_getdefaultencoding_not_ascii &&
+#endif
+            PyUnicode_Check(o)) {  /* unicode input is only handled when one of the default-encoding modes is compiled in */
+#if PY_VERSION_HEX < 0x03030000
+        char* defenc_c;
+        PyObject* defenc = _PyUnicode_AsDefaultEncodedString(o, NULL);
+        if (!defenc) return NULL;
+        defenc_c = PyBytes_AS_STRING(defenc);
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+        {
+            char* end = defenc_c + PyBytes_GET_SIZE(defenc);
+            char* c;
+            for (c = defenc_c; c < end; c++) {  /* scan for any byte outside the 7-bit ASCII range */
+                if ((unsigned char) (*c) >= 128) {
+                    PyUnicode_AsASCIIString(o);  /* result is discarded; presumably called only to raise the encoding error -- confirm */
+                    return NULL;
+                }
+            }
+        }
+#endif /*__PYX_DEFAULT_STRING_ENCODING_IS_ASCII*/
+        *length = PyBytes_GET_SIZE(defenc);
+        return defenc_c;
+#else /* PY_VERSION_HEX < 0x03030000 */
+        if (PyUnicode_READY(o) == -1) return NULL;
+#if __PYX_DEFAULT_STRING_ENCODING_IS_ASCII
+        if (PyUnicode_IS_ASCII(o)) {
+            *length = PyUnicode_GET_DATA_SIZE(o);
+            return PyUnicode_AsUTF8(o);
+        } else {
+            PyUnicode_AsASCIIString(o);  /* result is discarded; presumably called only to raise the encoding error -- confirm */
+            return NULL;
+        }
+#else /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */
+        return PyUnicode_AsUTF8AndSize(o, length);
+#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII */
+#endif /* PY_VERSION_HEX < 0x03030000 */
+    } else
+#endif /* __PYX_DEFAULT_STRING_ENCODING_IS_ASCII  || __PYX_DEFAULT_STRING_ENCODING_IS_DEFAULT */
+#if !CYTHON_COMPILING_IN_PYPY
+#if PY_VERSION_HEX >= 0x02060000
+    if (PyByteArray_Check(o)) {  /* bytearray: expose its internal buffer directly */
+        *length = PyByteArray_GET_SIZE(o);
+        return PyByteArray_AS_STRING(o);
+    } else
+#endif
+#endif
+    {  /* everything else goes through PyBytes_AsStringAndSize */
+        char* result;
+        int r = PyBytes_AsStringAndSize(o, &result, length);
+        if (unlikely(r < 0)) {
+            return NULL;
+        } else {
+            return result;
+        }
+    }
+}
+static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {  /* truth test with a pointer-compare shortcut for True/False/None */
+   if (x == Py_True) return 1;
+   if (x == Py_False || x == Py_None) return 0;
+   return PyObject_IsTrue(x);
+}
+static CYTHON_INLINE PyObject* __Pyx_PyNumber_Int(PyObject* x) {  /* coerce x to a Python integer object; returns a new reference or NULL with an exception set */
+  PyNumberMethods *m;
+  const char *name = NULL;  /* name of the conversion slot that was used, for the error message below */
+  PyObject *res = NULL;
+#if PY_MAJOR_VERSION < 3
+  if (PyInt_Check(x) || PyLong_Check(x))
+#else
+  if (PyLong_Check(x))
+#endif
+    return Py_INCREF(x), x;  /* already an integer: return x itself with one more reference */
+  m = Py_TYPE(x)->tp_as_number;
+#if PY_MAJOR_VERSION < 3
+  if (m && m->nb_int) {
+    name = "int";
+    res = PyNumber_Int(x);
+  }
+  else if (m && m->nb_long) {
+    name = "long";
+    res = PyNumber_Long(x);
+  }
+#else
+  if (m && m->nb_int) {
+    name = "int";
+    res = PyNumber_Long(x);
+  }
+#endif
+  if (res) {
+#if PY_MAJOR_VERSION < 3
+    if (!PyInt_Check(res) && !PyLong_Check(res)) {
+#else
+    if (!PyLong_Check(res)) {
+#endif
+      PyErr_Format(PyExc_TypeError,  /* the conversion returned a non-integer object */
+                   "__%.4s__ returned non-%.4s (type %.200s)",
+                   name, name, Py_TYPE(res)->tp_name);
+      Py_DECREF(res);
+      return NULL;
+    }
+  }
+  else if (!PyErr_Occurred()) {  /* no usable slot and no pending error: raise the generic TypeError */
+    PyErr_SetString(PyExc_TypeError,
+                    "an integer is required");
+  }
+  return res;
+}
+#if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+ #if CYTHON_USE_PYLONG_INTERNALS
+  #include "longintrepr.h"
+ #endif
+#endif
+static CYTHON_INLINE Py_ssize_t __Pyx_PyIndex_AsSsize_t(PyObject* b) {  /* convert an index-like object to Py_ssize_t; returns -1 if PyNumber_Index fails */
+  Py_ssize_t ival;
+  PyObject *x;
+#if PY_MAJOR_VERSION < 3
+  if (likely(PyInt_CheckExact(b)))  /* Python 2 only: exact int objects are read directly */
+      return PyInt_AS_LONG(b);
+#endif
+  if (likely(PyLong_CheckExact(b))) {
+    #if CYTHON_COMPILING_IN_CPYTHON && PY_MAJOR_VERSION >= 3
+     #if CYTHON_USE_PYLONG_INTERNALS
+       switch (Py_SIZE(b)) {  /* fast path: zero- and one-digit values are read straight from ob_digit */
+       case -1: return -(sdigit)((PyLongObject*)b)->ob_digit[0];
+       case  0: return 0;
+       case  1: return ((PyLongObject*)b)->ob_digit[0];
+       }
+     #endif
+    #endif
+  #if PY_VERSION_HEX < 0x02060000
+    return PyInt_AsSsize_t(b);
+  #else
+    return PyLong_AsSsize_t(b);
+  #endif
+  }
+  x = PyNumber_Index(b);  /* any other type: obtain an integer object first */
+  if (!x) return -1;
+  ival = PyInt_AsSsize_t(x);
+  Py_DECREF(x);
+  return ival;
+}
+static CYTHON_INLINE PyObject * __Pyx_PyInt_FromSize_t(size_t ival) {  /* box a size_t as a Python integer object */
+#if PY_VERSION_HEX < 0x02050000
+   if (ival <= LONG_MAX)  /* fits in a C long: use the plain int constructor */
+       return PyInt_FromLong((long)ival);
+   else {
+       unsigned char *bytes = (unsigned char *) &ival;
+       int one = 1; int little = (int)*(unsigned char*)&one;  /* runtime byte-order probe */
+       return _PyLong_FromByteArray(bytes, sizeof(size_t), little, 0);  /* final 0: the bytes are interpreted as unsigned */
+   }
+#else
+   return PyInt_FromSize_t(ival);
+#endif
+}
+
+
+#endif /* Py_PYTHON_H */
diff --git a/src/tools/voc_eval_lib/nms/gpu_nms.hpp b/src/tools/voc_eval_lib/nms/gpu_nms.hpp
new file mode 100644
index 0000000..68b6d42
--- /dev/null
+++ b/src/tools/voc_eval_lib/nms/gpu_nms.hpp
@@ -0,0 +1,15 @@
+// Host-side entry point of the CUDA non-maximum-suppression implementation
+// (defined in nms_kernel.cu).
+//
+// keep_out           receives the indices of the boxes that are kept
+// num_out            receives how many entries of keep_out were written
+// boxes_host         boxes_num * boxes_dim floats in host memory
+// nms_overlap_thresh overlap above which a box is suppressed
+// device_id          CUDA device to run on
+#ifndef GPU_NMS_HPP_
+#define GPU_NMS_HPP_
+
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id);
+
+#endif  // GPU_NMS_HPP_
diff --git a/src/tools/voc_eval_lib/nms/gpu_nms.pyx b/src/tools/voc_eval_lib/nms/gpu_nms.pyx
new file mode 100644
index 0000000..59d84af
--- /dev/null
+++ b/src/tools/voc_eval_lib/nms/gpu_nms.pyx
@@ -0,0 +1,61 @@
+# --------------------------------------------------------
+# Faster R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+cimport numpy as np
+
+# The int32 buffer of `keep` is handed to _nms as its int* output array.
+assert sizeof(int) == sizeof(np.int32_t)
+
+cdef extern from "gpu_nms.hpp":
+    void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
+
+def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
+            np.int32_t device_id=0):
+    """Run non-maximum suppression on the GPU.
+
+    Parameters
+    ----------
+    dets: (N, 5) float32 ndarray; columns 0-3 are the box coordinates and
+        column 4 is the score the boxes are ranked by
+    thresh: overlap threshold forwarded to the CUDA kernel
+    device_id: CUDA device to run on
+
+    Returns
+    -------
+    list of row indices into dets for the kept boxes, in order of
+    decreasing score
+
+    Raises
+    ------
+    ValueError: if dets does not have exactly 5 columns
+    """
+    cdef int boxes_num = dets.shape[0]
+    cdef int boxes_dim = dets.shape[1]
+    cdef int num_out = 0
+    cdef np.ndarray[np.int32_t, ndim=1] keep
+    cdef np.ndarray[np.float32_t, ndim=1] scores
+    # argsort returns platform-sized (intp) indices; np.int_t is a C long,
+    # which is only 32 bits wide on 64-bit Windows.
+    cdef np.ndarray[np.intp_t, ndim=1] order
+    cdef np.ndarray[np.float32_t, ndim=2] sorted_dets
+
+    # The kernel in nms_kernel.cu addresses the boxes with a fixed stride of 5.
+    if boxes_dim != 5:
+        raise ValueError('dets must have shape (N, 5), got (%d, %d)'
+                         % (boxes_num, boxes_dim))
+    # Nothing to suppress; also avoids indexing keep[0] of an empty array.
+    if boxes_num == 0:
+        return []
+
+    keep = np.zeros(boxes_num, dtype=np.int32)
+    scores = dets[:, 4]
+    order = scores.argsort()[::-1]
+    sorted_dets = dets[order, :]
+    _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
+    keep = keep[:num_out]
+    return list(order[keep])
diff --git a/src/tools/voc_eval_lib/nms/nms_kernel.cu b/src/tools/voc_eval_lib/nms/nms_kernel.cu
new file mode 100644
index 0000000..038a590
--- /dev/null
+++ b/src/tools/voc_eval_lib/nms/nms_kernel.cu
@@ -0,0 +1,174 @@
+// ------------------------------------------------------------------
+// Faster R-CNN
+// Copyright (c) 2015 Microsoft
+// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
+// Written by Shaoqing Ren
+// ------------------------------------------------------------------
+
+#include "gpu_nms.hpp"
+#include <vector>
+#include <iostream>
+
+// Prints the CUDA error string to stdout when a runtime call fails;
+// execution continues afterwards.
+#define CUDA_CHECK(condition) \
+  /* Code block avoids redefinition of cudaError_t error */ \
+  do { \
+    cudaError_t error = condition; \
+    if (error != cudaSuccess) { \
+      std::cout << cudaGetErrorString(error) << std::endl; \
+    } \
+  } while (0)
+
+#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
+// One thread per bit of an unsigned long long mask word.
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+// Intersection-over-union of two boxes laid out as [x1, y1, x2, y2, ...];
+// extents use the inclusive "+ 1" convention.
+__device__ inline float devIoU(float const * const a, float const * const b) {
+  float left = max(a[0], b[0]), right = min(a[2], b[2]);
+  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  float interS = width * height;
+  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return interS / (Sa + Sb - interS);
+}
+
+// Compares the boxes of tile row blockIdx.y with those of tile column
+// blockIdx.x. For each row box, bit i of the word stored at
+// dev_mask[box * col_blocks + blockIdx.x] is set when the IoU with box i of
+// the column tile exceeds nms_overlap_thresh. dev_boxes is read with a fixed
+// stride of 5 floats per box.
+__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+                           const float *dev_boxes, unsigned long long *dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  const int row_size =
+        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  // Stage the column tile's boxes in shared memory.
+  __shared__ float block_boxes[threadsPerBlock * 5];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 5;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    // On the diagonal tile only boxes after the current one are tested.
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+// Makes device_id the current CUDA device unless it already is.
+void _set_device(int device_id) {
+  int current_device;
+  CUDA_CHECK(cudaGetDevice(&current_device));
+  if (current_device == device_id) {
+    return;
+  }
+  CUDA_CHECK(cudaSetDevice(device_id));
+}
+
+// Host entry point: copies the boxes to the device, builds the pairwise
+// suppression mask with nms_kernel and greedily selects the survivors.
+// Boxes are visited in index order; the caller in gpu_nms.pyx passes them
+// sorted by decreasing score. keep_out receives the kept indices and
+// *num_out their count.
+void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
+          int boxes_dim, float nms_overlap_thresh, int device_id) {
+  _set_device(device_id);
+
+  // An empty input keeps nothing; a zero-sized grid is not a valid launch
+  // configuration and &mask_host[0] needs a non-empty vector.
+  if (boxes_num <= 0) {
+    *num_out = 0;
+    return;
+  }
+
+  float* boxes_dev = NULL;
+  unsigned long long* mask_dev = NULL;
+
+  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
+  // Buffer sizes are computed in size_t so that large inputs cannot
+  // overflow int arithmetic.
+  const size_t boxes_bytes =
+      static_cast<size_t>(boxes_num) * boxes_dim * sizeof(float);
+  const size_t mask_len = static_cast<size_t>(boxes_num) * col_blocks;
+  const size_t mask_bytes = mask_len * sizeof(unsigned long long);
+
+  CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_bytes));
+  CUDA_CHECK(cudaMemcpy(boxes_dev,
+                        boxes_host,
+                        boxes_bytes,
+                        cudaMemcpyHostToDevice));
+
+  CUDA_CHECK(cudaMalloc(&mask_dev, mask_bytes));
+
+  dim3 blocks(col_blocks, col_blocks);
+  dim3 threads(threadsPerBlock);
+  nms_kernel<<<blocks, threads>>>(boxes_num,
+                                  nms_overlap_thresh,
+                                  boxes_dev,
+                                  mask_dev);
+  // Launch errors do not surface at the call site; fetch them explicitly.
+  CUDA_CHECK(cudaGetLastError());
+
+  std::vector<unsigned long long> mask_host(mask_len);
+  CUDA_CHECK(cudaMemcpy(&mask_host[0],
+                        mask_dev,
+                        mask_bytes,
+                        cudaMemcpyDeviceToHost));
+
+  // Bit b of remv[j] marks box j * threadsPerBlock + b as suppressed by a
+  // box that was already kept.
+  std::vector<unsigned long long> remv(col_blocks, 0ULL);
+
+  int num_to_keep = 0;
+  for (int i = 0; i < boxes_num; i++) {
+    int nblock = i / threadsPerBlock;
+    int inblock = i % threadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      keep_out[num_to_keep++] = i;
+      unsigned long long *p =
+          &mask_host[0] + static_cast<size_t>(i) * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+  *num_out = num_to_keep;
+
+  CUDA_CHECK(cudaFree(boxes_dev));
+  CUDA_CHECK(cudaFree(mask_dev));
+}
diff --git a/src/tools/voc_eval_lib/nms/py_cpu_nms.py b/src/tools/voc_eval_lib/nms/py_cpu_nms.py
new file mode 100644
index 0000000..54e7b25
--- /dev/null
+++ b/src/tools/voc_eval_lib/nms/py_cpu_nms.py
@@ -0,0 +1,38 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import numpy as np
+
+def py_cpu_nms(dets, thresh):
+    """Greedy non-maximum suppression implemented with NumPy.
+
+    dets is indexed as dets[:, 0..4] = x1, y1, x2, y2, score and thresh is
+    the overlap above which a box is dropped in favour of a higher-scoring
+    one. Returns the list of kept row indices in decreasing score order.
+    """
+    left, top, right, bottom, scores = (dets[:, col] for col in range(5))
+    box_areas = (right - left + 1) * (bottom - top + 1)
+
+    # Candidates still alive, best score first.
+    remaining = scores.argsort()[::-1]
+    kept = []
+    while remaining.size > 0:
+        best, others = remaining[0], remaining[1:]
+        kept.append(best)
+
+        # Intersection of the best box with every other candidate.
+        inter_w = np.maximum(0.0, np.minimum(right[best], right[others])
+                             - np.maximum(left[best], left[others]) + 1)
+        inter_h = np.maximum(0.0, np.minimum(bottom[best], bottom[others])
+                             - np.maximum(top[best], top[others]) + 1)
+        overlap = inter_w * inter_h
+        iou = overlap / (box_areas[best] + box_areas[others] - overlap)
+
+        # Only candidates that do not overlap too much survive the round.
+        remaining = others[iou <= thresh]
+
+    return kept
diff --git a/src/tools/voc_eval_lib/setup.py b/src/tools/voc_eval_lib/setup.py
new file mode 100644
index 0000000..6693793
--- /dev/null
+++ b/src/tools/voc_eval_lib/setup.py
@@ -0,0 +1,151 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import os
+from os.path import join as pjoin
+import numpy as np
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
+
+def find_in_path(name, path):
+    """Return the absolute path of `name` inside the first directory of the
+    os.pathsep-separated search `path` that contains it, or None."""
+    # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
+    for directory in path.split(os.pathsep):
+        binpath = pjoin(directory, name)
+        if os.path.exists(binpath):
+            return os.path.abspath(binpath)
+    return None
+
+def locate_cuda():
+    """Locate the CUDA environment on the system
+
+    Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
+    and values giving the absolute path to each directory.
+
+    Starts by looking for the CUDAHOME env variable. If not found, everything
+    is based on finding 'nvcc' in the PATH.
+    """
+
+    # first check if the CUDAHOME env variable is in use
+    if 'CUDAHOME' in os.environ:
+        home = os.environ['CUDAHOME']
+        nvcc = pjoin(home, 'bin', 'nvcc')
+    else:
+        # otherwise, search the PATH for NVCC
+        default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin')
+        nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path)
+        if nvcc is None:
+            raise EnvironmentError('The nvcc binary could not be '
+                'located in your $PATH. Either add it to your path, or set $CUDAHOME')
+        home = os.path.dirname(os.path.dirname(nvcc))
+
+    cudaconfig = {'home':home, 'nvcc':nvcc,
+                  'include': pjoin(home, 'include'),
+                  'lib64': pjoin(home, 'lib64')}
+    for k, v in cudaconfig.items():
+        if not os.path.exists(v):
+            raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
+
+    return cudaconfig
+CUDA = locate_cuda()
+
+# GPU architecture handed to nvcc; set the CUDA_ARCH environment variable to
+# build for a device other than the default sm_61.
+CUDA_ARCH = os.environ.get('CUDA_ARCH', 'sm_61')
+
+# Obtain the numpy include directory.  This logic works across numpy versions.
+try:
+    numpy_include = np.get_include()
+except AttributeError:
+    numpy_include = np.get_numpy_include()
+
+def customize_compiler_for_nvcc(self):
+    """Inject deep into distutils to customize how the dispatch
+    to gcc/nvcc works.
+
+    If you subclass UnixCCompiler, it's not trivial to get your subclass
+    injected in, and still have the right customizations (i.e.
+    distutils.sysconfig.customize_compiler) run on it. So instead of going
+    the OO route, I have this. Note, it's kind of like a weird functional
+    subclassing going on."""
+
+    # tell the compiler it can process .cu
+    self.src_extensions.append('.cu')
+
+    # save references to the default compiler_so and _compile methods
+    default_compiler_so = self.compiler_so
+    default_compile = self._compile
+
+    # now redefine the _compile method. This gets executed for each
+    # object but distutils doesn't have the ability to change compilers
+    # based on source extension: we add it.
+    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
+        if os.path.splitext(src)[1] == '.cu':
+            # use nvcc for .cu files
+            self.set_executable('compiler_so', CUDA['nvcc'])
+            # use only a subset of the extra_postargs, which are 1-1 translated
+            # from the extra_compile_args in the Extension class
+            postargs = extra_postargs['nvcc']
+        else:
+            postargs = extra_postargs['gcc']
+
+        try:
+            default_compile(obj, src, ext, cc_args, postargs, pp_opts)
+        finally:
+            # reset the default compiler_so, which we might have changed for
+            # cuda, even when the compilation fails
+            self.compiler_so = default_compiler_so
+
+    # inject our redefined _compile method into the class
+    self._compile = _compile
+
+# run the customize_compiler
+class custom_build_ext(build_ext):
+    def build_extensions(self):
+        customize_compiler_for_nvcc(self.compiler)
+        build_ext.build_extensions(self)
+
+ext_modules = [
+    Extension(
+        "utils.cython_bbox",
+        ["utils/bbox.pyx"],
+        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
+        include_dirs = [numpy_include]
+    ),
+    Extension(
+        "nms.cpu_nms",
+        ["nms/cpu_nms.pyx"],
+        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
+        include_dirs = [numpy_include]
+    ),
+    Extension('nms.gpu_nms',
+        ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'],
+        library_dirs=[CUDA['lib64']],
+        libraries=['cudart'],
+        language='c++',
+        runtime_library_dirs=[CUDA['lib64']],
+        # this syntax is specific to this build system
+        # we're only going to use certain compiler args with nvcc and not with gcc
+        # the implementation of this trick is in customize_compiler_for_nvcc() above
+        extra_compile_args={'gcc': ["-Wno-unused-function"],
+                            'nvcc': ['-arch=' + CUDA_ARCH,
+                                     '--ptxas-options=-v',
+                                     '-c',
+                                     '--compiler-options',
+                                     "'-fPIC'"]},
+        include_dirs = [numpy_include, CUDA['include']]
+    )
+]
+
+setup(
+    name='tf_faster_rcnn',
+    ext_modules=ext_modules,
+    # inject our custom trigger
+    cmdclass={'build_ext': custom_build_ext},
+)
diff --git a/src/tools/voc_eval_lib/utils/.gitignore b/src/tools/voc_eval_lib/utils/.gitignore
new file mode 100644
index 0000000..3347b69
--- /dev/null
+++ b/src/tools/voc_eval_lib/utils/.gitignore
@@ -0,0 +1,4 @@
+*.c
+*.cpp
+*.h
+*.hpp
diff --git a/src/tools/voc_eval_lib/utils/__init__.py b/src/tools/voc_eval_lib/utils/__init__.py
new file mode 100644
index 0000000..7ba6a65
--- /dev/null
+++ b/src/tools/voc_eval_lib/utils/__init__.py
@@ -0,0 +1,6 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
diff --git a/src/tools/voc_eval_lib/utils/bbox.pyx b/src/tools/voc_eval_lib/utils/bbox.pyx
new file mode 100644
index 0000000..0f9c696
--- /dev/null
+++ b/src/tools/voc_eval_lib/utils/bbox.pyx
@@ -0,0 +1,64 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Sergey Karayev
+# --------------------------------------------------------
+
+cimport cython
+import numpy as np
+cimport numpy as np
+
+# np.float was an alias of the builtin float (float64 as a dtype) and was
+# removed in NumPy 1.24.
+DTYPE = np.float64
+ctypedef np.float64_t DTYPE_t
+
+def bbox_overlaps(
+        np.ndarray[DTYPE_t, ndim=2] boxes,
+        np.ndarray[DTYPE_t, ndim=2] query_boxes):
+    """
+    Compute the intersection-over-union of every box with every query box.
+
+    Boxes are rows of [x1, y1, x2, y2] with inclusive coordinates, hence the
+    "+ 1" in every width and height.
+
+    Parameters
+    ----------
+    boxes: (N, 4) ndarray of float
+    query_boxes: (K, 4) ndarray of float
+    Returns
+    -------
+    overlaps: (N, K) ndarray of overlap between boxes and query_boxes; pairs
+        that do not intersect keep the initial value 0
+    """
+    cdef unsigned int N = boxes.shape[0]
+    cdef unsigned int K = query_boxes.shape[0]
+    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
+    cdef DTYPE_t iw, ih, box_area
+    cdef DTYPE_t ua
+    cdef unsigned int k, n
+    for k in range(K):
+        box_area = (
+            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
+            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
+        )
+        for n in range(N):
+            iw = (
+                min(boxes[n, 2], query_boxes[k, 2]) -
+                max(boxes[n, 0], query_boxes[k, 0]) + 1
+            )
+            if iw > 0:
+                ih = (
+                    min(boxes[n, 3], query_boxes[k, 3]) -
+                    max(boxes[n, 1], query_boxes[k, 1]) + 1
+                )
+                if ih > 0:
+                    ua = float(
+                        (boxes[n, 2] - boxes[n, 0] + 1) *
+                        (boxes[n, 3] - boxes[n, 1] + 1) +
+                        box_area - iw * ih
+                    )
+                    overlaps[n, k] = iw * ih / ua
+    return overlaps
+
diff --git a/src/tools/voc_eval_lib/utils/blob.py b/src/tools/voc_eval_lib/utils/blob.py
new file mode 100644
index 0000000..0399fdd
--- /dev/null
+++ b/src/tools/voc_eval_lib/utils/blob.py
@@ -0,0 +1,57 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+"""Blob helper functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import cv2
+
+
+def im_list_to_blob(ims):
+  """Convert a list of images into a network input.
+
+  Assumes images are already prepared (means subtracted, BGR order, ...).
+  Every image is copied into the top-left corner of a zero-filled
+  (num_images, max_height, max_width, 3) float32 array.
+  """
+  max_shape = np.array([im.shape for im in ims]).max(axis=0)
+  num_images = len(ims)
+  blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
+                  dtype=np.float32)
+  for i, im in enumerate(ims):
+    blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
+
+  return blob
+
+
+def prep_im_for_blob(im, pixel_means, target_size, max_size):
+  """Mean subtract and scale an image for use in a blob.
+
+  The shorter image side is scaled to target_size unless that would make the
+  longer side exceed max_size, in which case the longer side is scaled to
+  max_size. The input array is left unmodified.
+
+  Returns the resized image and the scale factor that was applied.
+  """
+  # Always work on a private copy: with copy=False a float32 input would be
+  # aliased and the in-place mean subtraction would corrupt the caller's data.
+  im = im.astype(np.float32, copy=True)
+  im -= pixel_means
+  im_shape = im.shape
+  im_size_min = np.min(im_shape[0:2])
+  im_size_max = np.max(im_shape[0:2])
+  im_scale = float(target_size) / float(im_size_min)
+  # Prevent the biggest axis from being more than MAX_SIZE
+  if np.round(im_scale * im_size_max) > max_size:
+    im_scale = float(max_size) / float(im_size_max)
+  im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
+                  interpolation=cv2.INTER_LINEAR)
+
+  return im, im_scale
diff --git a/src/tools/voc_eval_lib/utils/timer.py b/src/tools/voc_eval_lib/utils/timer.py
new file mode 100644
index 0000000..dacc942
--- /dev/null
+++ b/src/tools/voc_eval_lib/utils/timer.py
@@ -0,0 +1,40 @@
+# --------------------------------------------------------
+# Fast R-CNN
+# Copyright (c) 2015 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Ross Girshick
+# --------------------------------------------------------
+
+import time
+
+class Timer(object):
+    """Stopwatch that accumulates the durations of tic()/toc() pairs."""
+
+    def __init__(self):
+        # start_time: wall-clock stamp of the latest tic()
+        # diff: duration of the latest tic()/toc() interval
+        # total_time / calls / average_time: running statistics over toc()
+        self.start_time = 0.
+        self.diff = 0.
+        self.total_time = 0.
+        self.calls = 0
+        self.average_time = 0.
+
+    def tic(self):
+        """Start (or restart) the measurement."""
+        # time.time is used rather than time.clock because time.clock
+        # does not normalize for multithreading
+        self.start_time = time.time()
+
+    def toc(self, average=True):
+        """Stop the measurement and update the statistics.
+
+        Returns the mean duration over all toc() calls when average is
+        true, otherwise the duration of the interval just measured.
+        """
+        elapsed = time.time() - self.start_time
+        self.diff = elapsed
+        self.calls += 1
+        self.total_time += elapsed
+        self.average_time = self.total_time / self.calls
+        return self.average_time if average else self.diff
diff --git a/src/tools/voc_eval_lib/utils/visualization.py b/src/tools/voc_eval_lib/utils/visualization.py
new file mode 100644
index 0000000..a34c843
--- /dev/null
+++ b/src/tools/voc_eval_lib/utils/visualization.py
@@ -0,0 +1,104 @@
+# --------------------------------------------------------
+# Tensorflow Faster R-CNN
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xinlei Chen
+# --------------------------------------------------------
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import range
+import PIL.Image as Image
+import PIL.ImageColor as ImageColor
+import PIL.ImageDraw as ImageDraw
+import PIL.ImageFont as ImageFont
+
+STANDARD_COLORS = [
+    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
+    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
+    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
+    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
+    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
+    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
+    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
+    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
+    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
+    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
+    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
+    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
+    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
+    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
+    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
+    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
+    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
+    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
+    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
+    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
+    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
+    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
+    'WhiteSmoke', 'Yellow', 'YellowGreen'
+]
+
+NUM_COLORS = len(STANDARD_COLORS)
+
+try:
+  FONT = ImageFont.truetype('arial.ttf', 24)
+except IOError:
+  FONT = ImageFont.load_default()
+
+def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, font, color='black', thickness=4):
+  """Draw a box outline and its label onto a PIL image and return the image.
+
+  The label is written on a filled rectangle whose bottom-left corner sits on
+  the bottom-left corner of the box.
+  """
+  draw = ImageDraw.Draw(image)
+  (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
+  draw.line([(left, top), (left, bottom), (right, bottom),
+             (right, top), (left, top)], width=thickness, fill=color)
+  text_bottom = bottom
+  if hasattr(font, 'getbbox'):
+    # font.getsize() was removed in Pillow 10; getbbox() returns
+    # (left, top, right, bottom) and its last two values replace it.
+    _, _, text_width, text_height = font.getbbox(display_str)
+  else:
+    text_width, text_height = font.getsize(display_str)
+  margin = np.ceil(0.05 * text_height)
+  draw.rectangle(
+      [(left, text_bottom - text_height - 2 * margin), (left + text_width,
+                                                        text_bottom)],
+      fill=color)
+  draw.text(
+      (left + margin, text_bottom - text_height - margin),
+      display_str,
+      fill='black',
+      font=font)
+
+  return image
+
+def draw_bounding_boxes(image, gt_boxes, im_info):
+  """Draw every row of gt_boxes onto image[0] and return image.
+
+  gt_boxes rows hold four box coordinates followed by a class index; the
+  coordinates are divided by im_info[2] and rounded before drawing. image[0]
+  is overwritten with the annotated picture.
+  """
+  num_boxes = gt_boxes.shape[0]
+  gt_boxes_new = gt_boxes.copy()
+  gt_boxes_new[:,:4] = np.round(gt_boxes_new[:,:4].copy() / im_info[2])
+  disp_image = Image.fromarray(np.uint8(image[0]))
+
+  for i in range(num_boxes):
+    this_class = int(gt_boxes_new[i, 4])
+    disp_image = _draw_single_box(disp_image,
+                                gt_boxes_new[i, 0],
+                                gt_boxes_new[i, 1],
+                                gt_boxes_new[i, 2],
+                                gt_boxes_new[i, 3],
+                                'N%02d-C%02d' % (i, this_class),
+                                FONT,
+                                color=STANDARD_COLORS[this_class % NUM_COLORS])
+
+  image[0, :] = np.array(disp_image)
+  return image